chiark / gitweb /
journal: split user logs into their own journal files
[elogind.git] / src / journal / sd-journal.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "sd-journal.h"
31 #include "journal-def.h"
32 #include "journal-private.h"
33 #include "lookup3.h"
34 #include "list.h"
35 #include "hashmap.h"
36
/* Default arena limits, in bytes: 16 GiB maximum, 256 KiB minimum, and
 * 1 MiB of file system space to leave free. */
#define DEFAULT_ARENA_MAX_SIZE (16ULL << 30)
#define DEFAULT_ARENA_MIN_SIZE (256ULL << 10)
#define DEFAULT_ARENA_KEEP_FREE (1ULL << 20)

/* Default sizes, in bytes, of the hash table and bisect table objects. The
 * bisect table gets 8 bytes for every 64 KiB of the maximum arena size. */
#define DEFAULT_HASH_TABLE_SIZE (16ULL * 2047ULL)
#define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE >> 16) * 8ULL)

/* Default size, in bytes, of the mmap window used to access objects */
#define DEFAULT_WINDOW_SIZE (128ULL << 20)
45
46 struct sd_journal {
47         Hashmap *files;
48 };
49
/* Magic bytes written at the start of the header and checked when an
 * existing file is verified. Deliberately not NUL-terminated. */
static const char signature[8] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Round x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
53
54 void journal_file_close(JournalFile *f) {
55         assert(f);
56
57         if (f->fd >= 0)
58                 close_nointr_nofail(f->fd);
59
60         if (f->header)
61                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
62
63         if (f->hash_table_window)
64                 munmap(f->hash_table_window, f->hash_table_window_size);
65
66         if (f->bisect_table_window)
67                 munmap(f->bisect_table_window, f->bisect_table_window_size);
68
69         if (f->window)
70                 munmap(f->window, f->window_size);
71
72         free(f->path);
73         free(f);
74 }
75
76 static int journal_file_init_header(JournalFile *f) {
77         Header h;
78         ssize_t k;
79         int r;
80
81         assert(f);
82
83         zero(h);
84         memcpy(h.signature, signature, 8);
85         h.arena_offset = htole64(ALIGN64(sizeof(h)));
86         h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
87         h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
88         h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
89
90         r = sd_id128_randomize(&h.file_id);
91         if (r < 0)
92                 return r;
93
94         k = pwrite(f->fd, &h, sizeof(h), 0);
95         if (k < 0)
96                 return -errno;
97
98         if (k != sizeof(h))
99                 return -EIO;
100
101         return 0;
102 }
103
104 static int journal_file_refresh_header(JournalFile *f) {
105         int r;
106
107         assert(f);
108
109         r = sd_id128_get_machine(&f->header->machine_id);
110         if (r < 0)
111                 return r;
112
113         r = sd_id128_get_boot(&f->header->boot_id);
114         if (r < 0)
115                 return r;
116
117         f->header->state = htole32(STATE_ONLINE);
118         return 0;
119 }
120
121 static int journal_file_verify_header(JournalFile *f) {
122         assert(f);
123
124         if (memcmp(f->header, signature, 8))
125                 return -EBADMSG;
126
127         if (f->header->incompatible_flags != 0)
128                 return -EPROTONOSUPPORT;
129
130         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
131                 return -ENODATA;
132
133         if (f->writable) {
134                 uint32_t state;
135                 sd_id128_t machine_id;
136                 int r;
137
138                 r = sd_id128_get_machine(&machine_id);
139                 if (r < 0)
140                         return r;
141
142                 if (!sd_id128_equal(machine_id, f->header->machine_id))
143                         return -EHOSTDOWN;
144
145                 state = le32toh(f->header->state);
146
147                 if (state == STATE_ONLINE)
148                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
149                 else if (state == STATE_ARCHIVED)
150                         return -ESHUTDOWN;
151                 else if (state != STATE_OFFLINE)
152                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
153         }
154
155         return 0;
156 }
157
158 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
159         uint64_t asize;
160         uint64_t old_size, new_size;
161
162         assert(f);
163
164         if (offset < le64toh(f->header->arena_offset))
165                 return -EINVAL;
166
167         new_size = PAGE_ALIGN(offset + size);
168
169         /* We assume that this file is not sparse, and we know that
170          * for sure, since we alway call posix_fallocate()
171          * ourselves */
172
173         old_size =
174                 le64toh(f->header->arena_offset) +
175                 le64toh(f->header->arena_size);
176
177         if (old_size >= new_size)
178                 return 0;
179
180         asize = new_size - le64toh(f->header->arena_offset);
181
182         if (asize > le64toh(f->header->arena_min_size)) {
183                 struct statvfs svfs;
184
185                 if (fstatvfs(f->fd, &svfs) >= 0) {
186                         uint64_t available;
187
188                         available = svfs.f_bfree * svfs.f_bsize;
189
190                         if (available >= f->header->arena_keep_free)
191                                 available -= f->header->arena_keep_free;
192                         else
193                                 available = 0;
194
195                         if (new_size - old_size > available)
196                                 return -E2BIG;
197                 }
198         }
199
200         if (asize > le64toh(f->header->arena_max_size))
201                 return -E2BIG;
202
203         if (posix_fallocate(f->fd, 0, new_size) < 0)
204                 return -errno;
205
206         if (fstat(f->fd, &f->last_stat) < 0)
207                 return -errno;
208
209         f->header->arena_size = htole64(asize);
210
211         return 0;
212 }
213
214 static int journal_file_map(
215                 JournalFile *f,
216                 uint64_t offset,
217                 uint64_t size,
218                 void **_window,
219                 uint64_t *_woffset,
220                 uint64_t *_wsize,
221                 void **ret) {
222
223         uint64_t woffset, wsize;
224         void *window;
225
226         assert(f);
227         assert(size > 0);
228         assert(ret);
229
230         woffset = offset & ~((uint64_t) page_size() - 1ULL);
231         wsize = size + (offset - woffset);
232         wsize = PAGE_ALIGN(wsize);
233
234         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
235         if (window == MAP_FAILED)
236                 return -errno;
237
238         if (_window)
239                 *_window = window;
240
241         if (_woffset)
242                 *_woffset = woffset;
243
244         if (_wsize)
245                 *_wsize = wsize;
246
247         *ret = (uint8_t*) window + (offset - woffset);
248
249         return 0;
250 }
251
252 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
253         void *p;
254         uint64_t delta;
255         int r;
256
257         assert(f);
258         assert(ret);
259
260         if (_likely_(f->window &&
261                      f->window_offset <= offset &&
262                      f->window_offset+f->window_size >= offset + size)) {
263
264                 *ret = (uint8_t*) f->window + (offset - f->window_offset);
265                 return 0;
266         }
267
268         if (f->window) {
269                 if (munmap(f->window, f->window_size) < 0)
270                         return -errno;
271
272                 f->window = NULL;
273                 f->window_size = f->window_offset = 0;
274         }
275
276         if (size < DEFAULT_WINDOW_SIZE) {
277                 /* If the default window size is larger then what was
278                  * asked for extend the mapping a bit in the hope to
279                  * minimize needed remappings later on. We add half
280                  * the window space before and half behind the
281                  * requested mapping */
282
283                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
284
285                 if (offset < delta)
286                         delta = offset;
287
288                 offset -= delta;
289                 size += (DEFAULT_WINDOW_SIZE - delta);
290         } else
291                 delta = 0;
292
293         r = journal_file_map(f,
294                              offset, size,
295                              &f->window, &f->window_offset, &f->window_size,
296                              & p);
297
298         if (r < 0)
299                 return r;
300
301         *ret = (uint8_t*) p + delta;
302         return 0;
303 }
304
305 static bool verify_hash(Object *o) {
306         uint64_t t;
307
308         assert(o);
309
310         t = le64toh(o->object.type);
311         if (t == OBJECT_DATA) {
312                 uint64_t s, h1, h2;
313
314                 s = le64toh(o->object.size);
315
316                 h1 = le64toh(o->data.hash);
317                 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
318
319                 return h1 == h2;
320         }
321
322         return true;
323 }
324
325 int journal_file_move_to_object(JournalFile *f, uint64_t offset, Object **ret) {
326         int r;
327         void *t;
328         Object *o;
329         uint64_t s;
330
331         assert(f);
332         assert(ret);
333
334         r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
335         if (r < 0)
336                 return r;
337
338         o = (Object*) t;
339         s = le64toh(o->object.size);
340
341         if (s < sizeof(ObjectHeader))
342                 return -EBADMSG;
343
344         if (s > sizeof(ObjectHeader)) {
345                 r = journal_file_move_to(f, offset, s, &t);
346                 if (r < 0)
347                         return r;
348
349                 o = (Object*) t;
350         }
351
352         if (!verify_hash(o))
353                 return -EBADMSG;
354
355         *ret = o;
356         return 0;
357 }
358
359 static uint64_t journal_file_seqnum(JournalFile *f) {
360         uint64_t r;
361
362         assert(f);
363
364         r = le64toh(f->header->seqnum) + 1;
365         f->header->seqnum = htole64(r);
366
367         return r;
368 }
369
370 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
371         int r;
372         uint64_t p;
373         Object *tail, *o;
374         void *t;
375
376         assert(f);
377         assert(size >= sizeof(ObjectHeader));
378         assert(offset);
379         assert(ret);
380
381         p = le64toh(f->header->tail_object_offset);
382
383         if (p == 0)
384                 p = le64toh(f->header->arena_offset);
385         else {
386                 r = journal_file_move_to_object(f, p, &tail);
387                 if (r < 0)
388                         return r;
389
390                 p += ALIGN64(le64toh(tail->object.size));
391         }
392
393         r = journal_file_allocate(f, p, size);
394         if (r < 0)
395                 return r;
396
397         r = journal_file_move_to(f, p, size, &t);
398         if (r < 0)
399                 return r;
400
401         o = (Object*) t;
402
403         zero(o->object);
404         o->object.type = htole64(OBJECT_UNUSED);
405         zero(o->object.reserved);
406         o->object.size = htole64(size);
407
408         f->header->tail_object_offset = htole64(p);
409         if (f->header->head_object_offset == 0)
410                 f->header->head_object_offset = htole64(p);
411
412         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
413
414         *ret = o;
415         *offset = p;
416
417         return 0;
418 }
419
420 static int journal_file_setup_hash_table(JournalFile *f) {
421         uint64_t s, p;
422         Object *o;
423         int r;
424
425         assert(f);
426
427         s = DEFAULT_HASH_TABLE_SIZE;
428         r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
429         if (r < 0)
430                 return r;
431
432         o->object.type = htole64(OBJECT_HASH_TABLE);
433         memset(o->hash_table.table, 0, s);
434
435         f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
436         f->header->hash_table_size = htole64(s);
437
438         return 0;
439 }
440
441 static int journal_file_setup_bisect_table(JournalFile *f) {
442         uint64_t s, p;
443         Object *o;
444         int r;
445
446         assert(f);
447
448         s = DEFAULT_BISECT_TABLE_SIZE;
449         r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
450         if (r < 0)
451                 return r;
452
453         o->object.type = htole64(OBJECT_BISECT_TABLE);
454         memset(o->bisect_table.table, 0, s);
455
456         f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
457         f->header->bisect_table_size = htole64(s);
458
459         return 0;
460 }
461
462 static int journal_file_map_hash_table(JournalFile *f) {
463         uint64_t s, p;
464         void *t;
465         int r;
466
467         assert(f);
468
469         p = le64toh(f->header->hash_table_offset);
470         s = le64toh(f->header->hash_table_size);
471
472         r = journal_file_map(f,
473                              p, s,
474                              &f->hash_table_window, NULL, &f->hash_table_window_size,
475                              &t);
476         if (r < 0)
477                 return r;
478
479         f->hash_table = t;
480         return 0;
481 }
482
483 static int journal_file_map_bisect_table(JournalFile *f) {
484         uint64_t s, p;
485         void *t;
486         int r;
487
488         assert(f);
489
490         p = le64toh(f->header->bisect_table_offset);
491         s = le64toh(f->header->bisect_table_size);
492
493         r = journal_file_map(f,
494                              p, s,
495                              &f->bisect_table_window, NULL, &f->bisect_table_window_size,
496                              &t);
497
498         if (r < 0)
499                 return r;
500
501         f->bisect_table = t;
502         return 0;
503 }
504
505 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
506         uint64_t p;
507         int r;
508
509         assert(f);
510         assert(o);
511         assert(offset > 0);
512         assert(o->object.type == htole64(OBJECT_DATA));
513
514         o->data.head_entry_offset = o->data.tail_entry_offset = 0;
515         o->data.next_hash_offset = 0;
516
517         p = le64toh(f->hash_table[hash_index].tail_hash_offset);
518         if (p == 0) {
519                 /* Only entry in the hash table is easy */
520
521                 o->data.prev_hash_offset = 0;
522                 f->hash_table[hash_index].head_hash_offset = htole64(offset);
523         } else {
524                 o->data.prev_hash_offset = htole64(p);
525
526                 /* Temporarily move back to the previous data object,
527                  * to patch in pointer */
528
529                 r = journal_file_move_to_object(f, p, &o);
530                 if (r < 0)
531                         return r;
532
533                 o->data.next_hash_offset = offset;
534
535                 r = journal_file_move_to_object(f, offset, &o);
536                 if (r < 0)
537                         return r;
538         }
539
540         f->hash_table[hash_index].tail_hash_offset = htole64(offset);
541
542         return 0;
543 }
544
545 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
546         uint64_t hash, h, p, np;
547         uint64_t osize;
548         Object *o;
549         int r;
550
551         assert(f);
552         assert(data || size == 0);
553
554         osize = offsetof(Object, data.payload) + size;
555
556         hash = hash64(data, size);
557         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
558         p = le64toh(f->hash_table[h].head_hash_offset);
559
560         while (p != 0) {
561                 /* Look for this data object in the hash table */
562
563                 r = journal_file_move_to_object(f, p, &o);
564                 if (r < 0)
565                         return r;
566
567                 if (le64toh(o->object.type) != OBJECT_DATA)
568                         return -EBADMSG;
569
570                 if (le64toh(o->object.size) == osize &&
571                     memcmp(o->data.payload, data, size) == 0) {
572
573                         if (le64toh(o->data.hash) != hash)
574                                 return -EBADMSG;
575
576                         if (ret)
577                                 *ret = o;
578
579                         if (offset)
580                                 *offset = p;
581
582                         return 0;
583                 }
584
585                 p = le64toh(o->data.next_hash_offset);
586         }
587
588         r = journal_file_append_object(f, osize, &o, &np);
589         if (r < 0)
590                 return r;
591
592         o->object.type = htole64(OBJECT_DATA);
593         o->data.hash = htole64(hash);
594         memcpy(o->data.payload, data, size);
595
596         r = journal_file_link_data(f, o, np, h);
597         if (r < 0)
598                 return r;
599
600         if (ret)
601                 *ret = o;
602
603         if (offset)
604                 *offset = np;
605
606         return 0;
607 }
608
609 uint64_t journal_file_entry_n_items(Object *o) {
610         assert(o);
611         assert(o->object.type == htole64(OBJECT_ENTRY));
612
613         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
614 }
615
616 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
617         uint64_t p, q;
618         int r;
619         assert(f);
620         assert(o);
621         assert(offset > 0);
622
623         p = le64toh(o->entry.items[i].object_offset);
624         if (p == 0)
625                 return -EINVAL;
626
627         o->entry.items[i].next_entry_offset = 0;
628
629         /* Move to the data object */
630         r = journal_file_move_to_object(f, p, &o);
631         if (r < 0)
632                 return r;
633
634         if (o->object.type != htole64(OBJECT_DATA))
635                 return -EBADMSG;
636
637         q = le64toh(o->data.tail_entry_offset);
638         o->data.tail_entry_offset = htole64(offset);
639
640         if (q == 0)
641                 o->data.head_entry_offset = htole64(offset);
642         else {
643                 uint64_t n, j;
644
645                 /* Move to previous entry */
646                 r = journal_file_move_to_object(f, q, &o);
647                 if (r < 0)
648                         return r;
649
650                 if (o->object.type != htole64(OBJECT_ENTRY))
651                         return -EBADMSG;
652
653                 n = journal_file_entry_n_items(o);
654                 for (j = 0; j < n; j++)
655                         if (le64toh(o->entry.items[j].object_offset) == p)
656                                 break;
657
658                 if (j >= n)
659                         return -EBADMSG;
660
661                 o->entry.items[j].next_entry_offset = offset;
662         }
663
664         /* Move back to original entry */
665         r = journal_file_move_to_object(f, offset, &o);
666         if (r < 0)
667                 return r;
668
669         o->entry.items[i].prev_entry_offset = q;
670         return 0;
671 }
672
673 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
674         uint64_t p, i, n, k, a, b;
675         int r;
676
677         assert(f);
678         assert(o);
679         assert(offset > 0);
680         assert(o->object.type == htole64(OBJECT_ENTRY));
681
682         /* Link up the entry itself */
683         p = le64toh(f->header->tail_entry_offset);
684
685         o->entry.prev_entry_offset = f->header->tail_entry_offset;
686         o->entry.next_entry_offset = 0;
687
688         if (p == 0)
689                 f->header->head_entry_offset = htole64(offset);
690         else {
691                 /* Temporarily move back to the previous entry, to
692                  * patch in pointer */
693
694                 r = journal_file_move_to_object(f, p, &o);
695                 if (r < 0)
696                         return r;
697
698                 o->entry.next_entry_offset = htole64(offset);
699
700                 r = journal_file_move_to_object(f, offset, &o);
701                 if (r < 0)
702                         return r;
703         }
704
705         f->header->tail_entry_offset = htole64(offset);
706
707         /* Link up the items */
708         n = journal_file_entry_n_items(o);
709         for (i = 0; i < n; i++) {
710                 r = journal_file_link_entry_item(f, o, offset, i);
711                 if (r < 0)
712                         return r;
713         }
714
715         /* Link up the entry in the bisect table */
716         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
717         k = le64toh(f->header->arena_max_size) / n;
718
719         a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
720         b = offset / k;
721
722         for (; a <= b; a++)
723                 f->bisect_table[a] = htole64(offset);
724
725         f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
726
727         return 0;
728 }
729
730 static int journal_file_append_entry_internal(
731                 JournalFile *f,
732                 const dual_timestamp *ts,
733                 uint64_t xor_hash,
734                 const EntryItem items[], unsigned n_items,
735                 Object **ret, uint64_t *offset) {
736         uint64_t np;
737         uint64_t osize;
738         Object *o;
739         int r;
740
741         assert(f);
742         assert(items || n_items == 0);
743
744         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
745
746         r = journal_file_append_object(f, osize, &o, &np);
747         if (r < 0)
748                 return r;
749
750         o->object.type = htole64(OBJECT_ENTRY);
751         o->entry.seqnum = htole64(journal_file_seqnum(f));
752         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
753         o->entry.realtime = ts ? htole64(ts->realtime) : 0;
754         o->entry.monotonic = ts ? htole64(ts->monotonic) : 0;
755         o->entry.xor_hash = htole64(xor_hash);
756
757         r = journal_file_link_entry(f, o, np);
758         if (r < 0)
759                 return r;
760
761         if (ret)
762                 *ret = o;
763
764         if (offset)
765                 *offset = np;
766
767         return 0;
768 }
769
770 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, Object **ret, uint64_t *offset) {
771         unsigned i;
772         EntryItem *items;
773         int r;
774         uint64_t xor_hash = 0;
775
776         assert(f);
777         assert(iovec || n_iovec == 0);
778
779         items = new(EntryItem, n_iovec);
780         if (!items)
781                 return -ENOMEM;
782
783         for (i = 0; i < n_iovec; i++) {
784                 uint64_t p;
785                 Object *o;
786
787                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
788                 if (r < 0)
789                         goto finish;
790
791                 xor_hash ^= le64toh(o->data.hash);
792                 items[i].object_offset = htole64(p);
793         }
794
795         r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, ret, offset);
796
797 finish:
798         free(items);
799
800         return r;
801 }
802
803 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
804         Object *o;
805         uint64_t lower, upper, p, n, k;
806         int r;
807
808         assert(f);
809
810         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
811         k = le64toh(f->header->arena_max_size) / n;
812
813         lower = 0;
814         upper = le64toh(f->header->last_bisect_offset)/k+1;
815
816         while (lower < upper) {
817                 k = (upper + lower) / 2;
818                 p = le64toh(f->bisect_table[k]);
819
820                 if (p == 0) {
821                         upper = k;
822                         continue;
823                 }
824
825                 r = journal_file_move_to_object(f, p, &o);
826                 if (r < 0)
827                         return r;
828
829                 if (o->object.type != htole64(OBJECT_ENTRY))
830                         return -EBADMSG;
831
832                 if (o->entry.seqnum == seqnum) {
833                         if (ret)
834                                 *ret = o;
835
836                         if (offset)
837                                 *offset = p;
838
839                         return 1;
840                 } else if (seqnum < o->entry.seqnum)
841                         upper = k;
842                 else if (seqnum > o->entry.seqnum)
843                         lower = k+1;
844         }
845
846         assert(lower == upper);
847
848         if (lower <= 0)
849                 return 0;
850
851         /* The object we are looking for is between
852          * bisect_table[lower-1] and bisect_table[lower] */
853
854         p = le64toh(f->bisect_table[lower-1]);
855
856         for (;;) {
857                 r = journal_file_move_to_object(f, p, &o);
858                 if (r < 0)
859                         return r;
860
861                 if (o->entry.seqnum == seqnum) {
862                         if (ret)
863                                 *ret = o;
864
865                         if (offset)
866                                 *offset = p;
867
868                         return 1;
869
870                 } if (seqnum < o->entry.seqnum)
871                         return 0;
872
873                 if (o->entry.next_entry_offset == 0)
874                         return 0;
875
876                 p = le64toh(o->entry.next_entry_offset);
877         }
878
879         return 0;
880 }
881
882 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
883         uint64_t np;
884         int r;
885
886         assert(f);
887
888         if (!o)
889                 np = le64toh(f->header->head_entry_offset);
890         else {
891                 if (le64toh(o->object.type) != OBJECT_ENTRY)
892                         return -EINVAL;
893
894                 np = le64toh(o->entry.next_entry_offset);
895         }
896
897         if (np == 0)
898                 return 0;
899
900         r = journal_file_move_to_object(f, np, &o);
901         if (r < 0)
902                 return r;
903
904         if (le64toh(o->object.type) != OBJECT_ENTRY)
905                 return -EBADMSG;
906
907         if (ret)
908                 *ret = o;
909
910         if (offset)
911                 *offset = np;
912
913         return 1;
914 }
915
916 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
917         uint64_t np;
918         int r;
919
920         assert(f);
921
922         if (!o)
923                 np = le64toh(f->header->tail_entry_offset);
924         else {
925                 if (le64toh(o->object.type) != OBJECT_ENTRY)
926                         return -EINVAL;
927
928                 np = le64toh(o->entry.prev_entry_offset);
929         }
930
931         if (np == 0)
932                 return 0;
933
934         r = journal_file_move_to_object(f, np, &o);
935         if (r < 0)
936                 return r;
937
938         if (le64toh(o->object.type) != OBJECT_ENTRY)
939                 return -EBADMSG;
940
941         if (ret)
942                 *ret = o;
943
944         if (offset)
945                 *offset = np;
946
947         return 1;
948 }
949
950 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
951         uint64_t p, osize, hash, h;
952         int r;
953
954         assert(f);
955         assert(data || size == 0);
956
957         osize = offsetof(Object, data.payload) + size;
958
959         hash = hash64(data, size);
960         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
961         p = le64toh(f->hash_table[h].head_hash_offset);
962
963         while (p != 0) {
964                 Object *o;
965
966                 r = journal_file_move_to_object(f, p, &o);
967                 if (r < 0)
968                         return r;
969
970                 if (le64toh(o->object.type) != OBJECT_DATA)
971                         return -EBADMSG;
972
973                 if (le64toh(o->object.size) == osize &&
974                     memcmp(o->data.payload, data, size) == 0) {
975
976                         if (le64toh(o->data.hash) != hash)
977                                 return -EBADMSG;
978
979                         if (o->data.head_entry_offset == 0)
980                                 return 0;
981
982                         p = le64toh(o->data.head_entry_offset);
983                         r = journal_file_move_to_object(f, p, &o);
984                         if (r < 0)
985                                 return r;
986
987                         if (le64toh(o->object.type) != OBJECT_ENTRY)
988                                 return -EBADMSG;
989
990                         if (ret)
991                                 *ret = o;
992
993                         if (offset)
994                                 *offset = p;
995
996                         return 1;
997                 }
998
999                 p = le64toh(o->data.next_hash_offset);
1000         }
1001
1002         return 0;
1003 }
1004
1005 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
1006         uint64_t p, osize, hash, h;
1007         int r;
1008
1009         assert(f);
1010         assert(data || size == 0);
1011
1012         osize = offsetof(Object, data.payload) + size;
1013
1014         hash = hash64(data, size);
1015         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1016         p = le64toh(f->hash_table[h].tail_hash_offset);
1017
1018         while (p != 0) {
1019                 Object *o;
1020
1021                 r = journal_file_move_to_object(f, p, &o);
1022                 if (r < 0)
1023                         return r;
1024
1025                 if (le64toh(o->object.type) != OBJECT_DATA)
1026                         return -EBADMSG;
1027
1028                 if (le64toh(o->object.size) == osize &&
1029                     memcmp(o->data.payload, data, size) == 0) {
1030
1031                         if (le64toh(o->data.hash) != hash)
1032                                 return -EBADMSG;
1033
1034                         if (o->data.tail_entry_offset == 0)
1035                                 return 0;
1036
1037                         p = le64toh(o->data.tail_entry_offset);
1038                         r = journal_file_move_to_object(f, p, &o);
1039                         if (r < 0)
1040                                 return r;
1041
1042                         if (le64toh(o->object.type) != OBJECT_ENTRY)
1043                                 return -EBADMSG;
1044
1045                         if (ret)
1046                                 *ret = o;
1047
1048                         if (offset)
1049                                 *offset = p;
1050
1051                         return 1;
1052                 }
1053
1054                 p = le64toh(o->data.prev_hash_offset);
1055         }
1056
1057         return 0;
1058 }
1059
1060 void journal_file_dump(JournalFile *f) {
1061         char a[33], b[33], c[33];
1062         Object *o;
1063         int r;
1064         uint64_t p;
1065
1066         assert(f);
1067
1068         printf("File ID: %s\n"
1069                "Machine ID: %s\n"
1070                "Boot ID: %s\n"
1071                "Arena size: %llu\n",
1072                sd_id128_to_string(f->header->file_id, a),
1073                sd_id128_to_string(f->header->machine_id, b),
1074                sd_id128_to_string(f->header->boot_id, c),
1075                (unsigned long long) le64toh(f->header->arena_size));
1076
1077         p = le64toh(f->header->head_object_offset);
1078         while (p != 0) {
1079                 r = journal_file_move_to_object(f, p, &o);
1080                 if (r < 0)
1081                         goto fail;
1082
1083                 switch (o->object.type) {
1084
1085                 case OBJECT_UNUSED:
1086                         printf("Type: OBJECT_UNUSED\n");
1087                         break;
1088
1089                 case OBJECT_DATA:
1090                         printf("Type: OBJECT_DATA\n");
1091                         break;
1092
1093                 case OBJECT_ENTRY:
1094                         printf("Type: OBJECT_ENTRY %llu\n", (unsigned long long) le64toh(o->entry.seqnum));
1095                         break;
1096
1097                 case OBJECT_HASH_TABLE:
1098                         printf("Type: OBJECT_HASH_TABLE\n");
1099                         break;
1100
1101                 case OBJECT_BISECT_TABLE:
1102                         printf("Type: OBJECT_BISECT_TABLE\n");
1103                         break;
1104                 }
1105
1106                 if (p == le64toh(f->header->tail_object_offset))
1107                         p = 0;
1108                 else
1109                         p = p + ALIGN64(le64toh(o->object.size));
1110         }
1111
1112         return;
1113 fail:
1114         log_error("File corrupt");
1115 }
1116
1117 int journal_file_open(
1118                 const char *fname,
1119                 int flags,
1120                 mode_t mode,
1121                 JournalFile **ret) {
1122
1123         JournalFile *f;
1124         int r;
1125         bool newly_created = false;
1126
1127         assert(fname);
1128
1129         if ((flags & O_ACCMODE) != O_RDONLY &&
1130             (flags & O_ACCMODE) != O_RDWR)
1131                 return -EINVAL;
1132
1133         f = new0(JournalFile, 1);
1134         if (!f)
1135                 return -ENOMEM;
1136
1137         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1138         f->prot = prot_from_flags(flags);
1139
1140         f->fd = open(fname, flags|O_CLOEXEC, mode);
1141         if (f->fd < 0) {
1142                 r = -errno;
1143                 goto fail;
1144         }
1145
1146         f->path = strdup(fname);
1147         if (!f->path) {
1148                 r = -ENOMEM;
1149                 goto fail;
1150         }
1151
1152         if (fstat(f->fd, &f->last_stat) < 0) {
1153                 r = -errno;
1154                 goto fail;
1155         }
1156
1157         if (f->last_stat.st_size == 0 && f->writable) {
1158                 newly_created = true;
1159
1160                 r = journal_file_init_header(f);
1161                 if (r < 0)
1162                         goto fail;
1163
1164                 if (fstat(f->fd, &f->last_stat) < 0) {
1165                         r = -errno;
1166                         goto fail;
1167                 }
1168         }
1169
1170         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1171                 r = -EIO;
1172                 goto fail;
1173         }
1174
1175         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1176         if (f->header == MAP_FAILED) {
1177                 f->header = NULL;
1178                 r = -errno;
1179                 goto fail;
1180         }
1181
1182         if (!newly_created) {
1183                 r = journal_file_verify_header(f);
1184                 if (r < 0)
1185                         goto fail;
1186         }
1187
1188         if (f->writable) {
1189                 r = journal_file_refresh_header(f);
1190                 if (r < 0)
1191                         goto fail;
1192         }
1193
1194         if (newly_created) {
1195
1196                 r = journal_file_setup_hash_table(f);
1197                 if (r < 0)
1198                         goto fail;
1199
1200                 r = journal_file_setup_bisect_table(f);
1201                 if (r < 0)
1202                         goto fail;
1203         }
1204
1205         r = journal_file_map_hash_table(f);
1206         if (r < 0)
1207                 goto fail;
1208
1209         r = journal_file_map_bisect_table(f);
1210         if (r < 0)
1211                 goto fail;
1212
1213         if (ret)
1214                 *ret = f;
1215
1216         return 0;
1217
1218 fail:
1219         journal_file_close(f);
1220
1221         return r;
1222 }
1223
1224 int sd_journal_open(sd_journal **ret) {
1225         sd_journal *j;
1226         char *fn;
1227         const char *p;
1228         int r = 0;
1229         const char search_paths[] =
1230                 "/run/log/journal\0"
1231                 "/var/log/journal\0";
1232
1233         assert(ret);
1234
1235         j = new0(sd_journal, 1);
1236         if (!j)
1237                 return -ENOMEM;
1238
1239         j->files = hashmap_new(string_hash_func, string_compare_func);
1240         if (!j->files)
1241                 goto fail;
1242
1243         NULSTR_FOREACH(p, search_paths) {
1244                 DIR *d;
1245
1246                 d = opendir(p);
1247                 if (!d) {
1248                         if (errno != ENOENT && r == 0)
1249                                 r = -errno;
1250
1251                         continue;
1252                 }
1253
1254                 for (;;) {
1255                         struct dirent buf, *de;
1256                         int k;
1257                         JournalFile *f;
1258
1259                         k = readdir_r(d, &buf, &de);
1260                         if (k != 0) {
1261                                 if (r == 0)
1262                                         r = -k;
1263
1264                                 break;
1265                         }
1266
1267                         if (!de)
1268                                 break;
1269
1270                         if (!dirent_is_file_with_suffix(de, ".journal"))
1271                                 continue;
1272
1273                         fn = join(p, "/", de->d_name, NULL);
1274                         if (!fn) {
1275                                 r = -ENOMEM;
1276                                 closedir(d);
1277                                 goto fail;
1278                         }
1279
1280                         k = journal_file_open(fn, O_RDONLY, 0, &f);
1281                         free(fn);
1282
1283                         if (k < 0) {
1284
1285                                 if (r == 0)
1286                                         r = -k;
1287                         } else {
1288                                 k = hashmap_put(j->files, f->path, f);
1289                                 if (k < 0) {
1290                                         journal_file_close(f);
1291                                         closedir(d);
1292
1293                                         r = k;
1294                                         goto fail;
1295                                 }
1296                         }
1297                 }
1298         }
1299
1300         *ret = j;
1301         return 0;
1302
1303 fail:
1304         sd_journal_close(j);
1305
1306         return r;
1307 };
1308
1309 void sd_journal_close(sd_journal *j) {
1310         assert(j);
1311
1312         if (j->files) {
1313                 JournalFile *f;
1314
1315                 while ((f = hashmap_steal_first(j->files)))
1316                         journal_file_close(f);
1317
1318                 hashmap_free(j->files);
1319         }
1320
1321         free(j);
1322 }