chiark / gitweb /
f1dd92927c10cb087428b584ad2c696deccbc571
[elogind.git] / src / journal / sd-journal.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2011 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "sd-journal.h"
31 #include "journal-def.h"
32 #include "journal-private.h"
33 #include "lookup3.h"
34 #include "list.h"
35
/* Defaults stored into a freshly written header by
 * journal_file_init_header(): upper bound, lower bound and keep-free
 * threshold that journal_file_allocate() later checks against. */
#define DEFAULT_ARENA_MAX_SIZE (16ULL*1024ULL*1024ULL*1024ULL)
#define DEFAULT_ARENA_MIN_SIZE (256ULL*1024ULL)
#define DEFAULT_ARENA_KEEP_FREE (1ULL*1024ULL*1024ULL)

/* Byte sizes used by journal_file_setup_hash_table() and
 * journal_file_setup_bisect_table() when creating the table objects. */
#define DEFAULT_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_BISECT_TABLE_SIZE ((DEFAULT_ARENA_MAX_SIZE/(64ULL*1024ULL))*8ULL)

/* Requests smaller than this are enlarged by journal_file_move_to()
 * before the window is mapped. */
#define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)
44
/* Per-file state of one journal file. */
struct JournalFile {
        /* Journal this file is linked into, or NULL;
         * journal_file_close() unlinks the file from it. */
        sd_journal *journal;

        /* File descriptor (closed on journal_file_close() if >= 0) */
        int fd;
        /* Path, used in log messages; freed by journal_file_close() */
        char *path;
        /* Result of the most recent fstat() on fd */
        struct stat last_stat;
        /* Protection flags handed to mmap() */
        int prot;
        /* Enables the machine ID and state checks when verifying the header */
        bool writable;

        /* Mapped file header, PAGE_ALIGN(sizeof(Header)) bytes */
        Header *header;

        /* Hash table and the mapping that contains it */
        HashItem *hash_table;
        void *hash_table_window;
        uint64_t hash_table_window_size;

        /* Bisect table and the mapping that contains it */
        uint64_t *bisect_table;
        void *bisect_table_window;
        uint64_t bisect_table_window_size;

        /* Sliding window over the arena maintained by
         * journal_file_move_to(): mapping, file offset and length */
        void *window;
        uint64_t window_offset;
        uint64_t window_size;

        /* NOTE(review): not referenced in the code visible here;
         * presumably a cursor to the current object -- confirm */
        Object *current;
        uint64_t current_offset;

        /* Linkage for the sd_journal.files list */
        LIST_FIELDS(JournalFile, files);
};
73
/* A journal: the list of JournalFile objects that belong to it. */
struct sd_journal {
        LIST_HEAD(JournalFile, files);
};
77
/* The 8 magic bytes at the start of every journal file; written by
 * journal_file_init_header(), checked by journal_file_verify_header(). */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8. */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
81
82 void journal_file_close(JournalFile *f) {
83         assert(f);
84
85         if (f->journal)
86                 LIST_REMOVE(JournalFile, files, f->journal->files, f);
87
88         if (f->fd >= 0)
89                 close_nointr_nofail(f->fd);
90
91         if (f->header)
92                 munmap(f->header, PAGE_ALIGN(sizeof(Header)));
93
94         if (f->hash_table_window)
95                 munmap(f->hash_table_window, f->hash_table_window_size);
96
97         if (f->bisect_table_window)
98                 munmap(f->bisect_table_window, f->bisect_table_window_size);
99
100         if (f->window)
101                 munmap(f->window, f->window_size);
102
103         free(f->path);
104         free(f);
105 }
106
107 static int journal_file_init_header(JournalFile *f) {
108         Header h;
109         ssize_t k;
110         int r;
111
112         assert(f);
113
114         zero(h);
115         memcpy(h.signature, signature, 8);
116         h.arena_offset = htole64(ALIGN64(sizeof(h)));
117         h.arena_max_size = htole64(DEFAULT_ARENA_MAX_SIZE);
118         h.arena_min_size = htole64(DEFAULT_ARENA_MIN_SIZE);
119         h.arena_keep_free = htole64(DEFAULT_ARENA_KEEP_FREE);
120
121         r = sd_id128_randomize(&h.file_id);
122         if (r < 0)
123                 return r;
124
125         k = pwrite(f->fd, &h, sizeof(h), 0);
126         if (k < 0)
127                 return -errno;
128
129         if (k != sizeof(h))
130                 return -EIO;
131
132         return 0;
133 }
134
135 static int journal_file_refresh_header(JournalFile *f) {
136         int r;
137
138         assert(f);
139
140         r = sd_id128_get_machine(&f->header->machine_id);
141         if (r < 0)
142                 return r;
143
144         r = sd_id128_get_boot(&f->header->boot_id);
145         if (r < 0)
146                 return r;
147
148         f->header->state = htole32(STATE_ONLINE);
149         return 0;
150 }
151
152 static int journal_file_verify_header(JournalFile *f) {
153         assert(f);
154
155         if (memcmp(f->header, signature, 8))
156                 return -EBADMSG;
157
158         if (f->header->incompatible_flags != 0)
159                 return -EPROTONOSUPPORT;
160
161         if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162                 return -ENODATA;
163
164         if (f->writable) {
165                 uint32_t state;
166                 sd_id128_t machine_id;
167                 int r;
168
169                 r = sd_id128_get_machine(&machine_id);
170                 if (r < 0)
171                         return r;
172
173                 if (!sd_id128_equal(machine_id, f->header->machine_id))
174                         return -EHOSTDOWN;
175
176                 state = le32toh(f->header->state);
177
178                 if (state == STATE_ONLINE)
179                         log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180                 else if (state == STATE_ARCHIVED)
181                         return -ESHUTDOWN;
182                 else if (state != STATE_OFFLINE)
183                         log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184         }
185
186         return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190         uint64_t asize;
191         uint64_t old_size, new_size;
192
193         assert(f);
194
195         if (offset < le64toh(f->header->arena_offset))
196                 return -EINVAL;
197
198         new_size = PAGE_ALIGN(offset + size);
199
200         /* We assume that this file is not sparse, and we know that
201          * for sure, since we alway call posix_fallocate()
202          * ourselves */
203
204         old_size =
205                 le64toh(f->header->arena_offset) +
206                 le64toh(f->header->arena_size);
207
208         if (old_size >= new_size)
209                 return 0;
210
211         asize = new_size - le64toh(f->header->arena_offset);
212
213         if (asize > le64toh(f->header->arena_min_size)) {
214                 struct statvfs svfs;
215
216                 if (fstatvfs(f->fd, &svfs) >= 0) {
217                         uint64_t available;
218
219                         available = svfs.f_bfree * svfs.f_bsize;
220
221                         if (available >= f->header->arena_keep_free)
222                                 available -= f->header->arena_keep_free;
223                         else
224                                 available = 0;
225
226                         if (new_size - old_size > available)
227                                 return -E2BIG;
228                 }
229         }
230
231         if (asize > le64toh(f->header->arena_max_size))
232                 return -E2BIG;
233
234         if (posix_fallocate(f->fd, 0, new_size) < 0)
235                 return -errno;
236
237         if (fstat(f->fd, &f->last_stat) < 0)
238                 return -errno;
239
240         f->header->arena_size = htole64(asize);
241
242         return 0;
243 }
244
245 static int journal_file_map(
246                 JournalFile *f,
247                 uint64_t offset,
248                 uint64_t size,
249                 void **_window,
250                 uint64_t *_woffset,
251                 uint64_t *_wsize,
252                 void **ret) {
253
254         uint64_t woffset, wsize;
255         void *window;
256
257         assert(f);
258         assert(size > 0);
259         assert(ret);
260
261         woffset = offset & ~((uint64_t) page_size() - 1ULL);
262         wsize = size + (offset - woffset);
263         wsize = PAGE_ALIGN(wsize);
264
265         window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
266         if (window == MAP_FAILED)
267                 return -errno;
268
269         if (_window)
270                 *_window = window;
271
272         if (_woffset)
273                 *_woffset = woffset;
274
275         if (_wsize)
276                 *_wsize = wsize;
277
278         *ret = (uint8_t*) window + (offset - woffset);
279
280         return 0;
281 }
282
283 static int journal_file_move_to(JournalFile *f, uint64_t offset, uint64_t size, void **ret) {
284         void *p;
285         uint64_t delta;
286         int r;
287
288         assert(f);
289         assert(ret);
290
291         if (_likely_(f->window &&
292                      f->window_offset <= offset &&
293                      f->window_offset+f->window_size >= offset + size)) {
294
295                 *ret = (uint8_t*) f->window + (offset - f->window_offset);
296                 return 0;
297         }
298
299         if (f->window) {
300                 if (munmap(f->window, f->window_size) < 0)
301                         return -errno;
302
303                 f->window = NULL;
304                 f->window_size = f->window_offset = 0;
305         }
306
307         if (size < DEFAULT_WINDOW_SIZE) {
308                 /* If the default window size is larger then what was
309                  * asked for extend the mapping a bit in the hope to
310                  * minimize needed remappings later on. We add half
311                  * the window space before and half behind the
312                  * requested mapping */
313
314                 delta = PAGE_ALIGN((DEFAULT_WINDOW_SIZE - size) / 2);
315
316                 if (offset < delta)
317                         delta = offset;
318
319                 offset -= delta;
320                 size += (DEFAULT_WINDOW_SIZE - delta);
321         } else
322                 delta = 0;
323
324         r = journal_file_map(f,
325                              offset, size,
326                              &f->window, &f->window_offset, &f->window_size,
327                              & p);
328
329         if (r < 0)
330                 return r;
331
332         *ret = (uint8_t*) p + delta;
333         return 0;
334 }
335
336 static bool verify_hash(Object *o) {
337         uint64_t t;
338
339         assert(o);
340
341         t = le64toh(o->object.type);
342         if (t == OBJECT_DATA) {
343                 uint64_t s, h1, h2;
344
345                 s = le64toh(o->object.size);
346
347                 h1 = le64toh(o->data.hash);
348                 h2 = hash64(o->data.payload, s - offsetof(Object, data.payload));
349
350                 return h1 == h2;
351         }
352
353         return true;
354 }
355
356 int journal_file_move_to_object(JournalFile *f, uint64_t offset, Object **ret) {
357         int r;
358         void *t;
359         Object *o;
360         uint64_t s;
361
362         assert(f);
363         assert(ret);
364
365         r = journal_file_move_to(f, offset, sizeof(ObjectHeader), &t);
366         if (r < 0)
367                 return r;
368
369         o = (Object*) t;
370         s = le64toh(o->object.size);
371
372         if (s < sizeof(ObjectHeader))
373                 return -EBADMSG;
374
375         if (s > sizeof(ObjectHeader)) {
376                 r = journal_file_move_to(f, offset, s, &t);
377                 if (r < 0)
378                         return r;
379
380                 o = (Object*) t;
381         }
382
383         if (!verify_hash(o))
384                 return -EBADMSG;
385
386         *ret = o;
387         return 0;
388 }
389
390 static uint64_t journal_file_seqnum(JournalFile *f) {
391         uint64_t r;
392
393         assert(f);
394
395         r = le64toh(f->header->seqnum) + 1;
396         f->header->seqnum = htole64(r);
397
398         return r;
399 }
400
401 static int journal_file_append_object(JournalFile *f, uint64_t size, Object **ret, uint64_t *offset) {
402         int r;
403         uint64_t p;
404         Object *tail, *o;
405         void *t;
406
407         assert(f);
408         assert(size >= sizeof(ObjectHeader));
409         assert(offset);
410         assert(ret);
411
412         p = le64toh(f->header->tail_object_offset);
413
414         if (p == 0)
415                 p = le64toh(f->header->arena_offset);
416         else {
417                 r = journal_file_move_to_object(f, p, &tail);
418                 if (r < 0)
419                         return r;
420
421                 p += ALIGN64(le64toh(tail->object.size));
422         }
423
424         r = journal_file_allocate(f, p, size);
425         if (r < 0)
426                 return r;
427
428         r = journal_file_move_to(f, p, size, &t);
429         if (r < 0)
430                 return r;
431
432         o = (Object*) t;
433
434         zero(o->object);
435         o->object.type = htole64(OBJECT_UNUSED);
436         zero(o->object.reserved);
437         o->object.size = htole64(size);
438
439         f->header->tail_object_offset = htole64(p);
440         if (f->header->head_object_offset == 0)
441                 f->header->head_object_offset = htole64(p);
442
443         f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
444
445         *ret = o;
446         *offset = p;
447
448         return 0;
449 }
450
451 static int journal_file_setup_hash_table(JournalFile *f) {
452         uint64_t s, p;
453         Object *o;
454         int r;
455
456         assert(f);
457
458         s = DEFAULT_HASH_TABLE_SIZE;
459         r = journal_file_append_object(f, offsetof(Object, hash_table.table) + s, &o, &p);
460         if (r < 0)
461                 return r;
462
463         o->object.type = htole64(OBJECT_HASH_TABLE);
464         memset(o->hash_table.table, 0, s);
465
466         f->header->hash_table_offset = htole64(p + offsetof(Object, hash_table.table));
467         f->header->hash_table_size = htole64(s);
468
469         return 0;
470 }
471
472 static int journal_file_setup_bisect_table(JournalFile *f) {
473         uint64_t s, p;
474         Object *o;
475         int r;
476
477         assert(f);
478
479         s = DEFAULT_BISECT_TABLE_SIZE;
480         r = journal_file_append_object(f, offsetof(Object, bisect_table.table) + s, &o, &p);
481         if (r < 0)
482                 return r;
483
484         o->object.type = htole64(OBJECT_BISECT_TABLE);
485         memset(o->bisect_table.table, 0, s);
486
487         f->header->bisect_table_offset = htole64(p + offsetof(Object, bisect_table.table));
488         f->header->bisect_table_size = htole64(s);
489
490         return 0;
491 }
492
493 static int journal_file_map_hash_table(JournalFile *f) {
494         uint64_t s, p;
495         void *t;
496         int r;
497
498         assert(f);
499
500         p = le64toh(f->header->hash_table_offset);
501         s = le64toh(f->header->hash_table_size);
502
503         r = journal_file_map(f,
504                              p, s,
505                              &f->hash_table_window, NULL, &f->hash_table_window_size,
506                              &t);
507         if (r < 0)
508                 return r;
509
510         f->hash_table = t;
511         return 0;
512 }
513
514 static int journal_file_map_bisect_table(JournalFile *f) {
515         uint64_t s, p;
516         void *t;
517         int r;
518
519         assert(f);
520
521         p = le64toh(f->header->bisect_table_offset);
522         s = le64toh(f->header->bisect_table_size);
523
524         r = journal_file_map(f,
525                              p, s,
526                              &f->bisect_table_window, NULL, &f->bisect_table_window_size,
527                              &t);
528
529         if (r < 0)
530                 return r;
531
532         f->bisect_table = t;
533         return 0;
534 }
535
536 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash_index) {
537         uint64_t p;
538         int r;
539
540         assert(f);
541         assert(o);
542         assert(offset > 0);
543         assert(o->object.type == htole64(OBJECT_DATA));
544
545         o->data.head_entry_offset = o->data.tail_entry_offset = 0;
546         o->data.next_hash_offset = 0;
547
548         p = le64toh(f->hash_table[hash_index].tail_hash_offset);
549         if (p == 0) {
550                 /* Only entry in the hash table is easy */
551
552                 o->data.prev_hash_offset = 0;
553                 f->hash_table[hash_index].head_hash_offset = htole64(offset);
554         } else {
555                 o->data.prev_hash_offset = htole64(p);
556
557                 /* Temporarily move back to the previous data object,
558                  * to patch in pointer */
559
560                 r = journal_file_move_to_object(f, p, &o);
561                 if (r < 0)
562                         return r;
563
564                 o->data.next_hash_offset = offset;
565
566                 r = journal_file_move_to_object(f, offset, &o);
567                 if (r < 0)
568                         return r;
569         }
570
571         f->hash_table[hash_index].tail_hash_offset = htole64(offset);
572
573         return 0;
574 }
575
576 static int journal_file_append_data(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
577         uint64_t hash, h, p, np;
578         uint64_t osize;
579         Object *o;
580         int r;
581
582         assert(f);
583         assert(data || size == 0);
584
585         osize = offsetof(Object, data.payload) + size;
586
587         hash = hash64(data, size);
588         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
589         p = le64toh(f->hash_table[h].head_hash_offset);
590
591         while (p != 0) {
592                 /* Look for this data object in the hash table */
593
594                 r = journal_file_move_to_object(f, p, &o);
595                 if (r < 0)
596                         return r;
597
598                 if (le64toh(o->object.type) != OBJECT_DATA)
599                         return -EBADMSG;
600
601                 if (le64toh(o->object.size) == osize &&
602                     memcmp(o->data.payload, data, size) == 0) {
603
604                         if (le64toh(o->data.hash) != hash)
605                                 return -EBADMSG;
606
607                         if (ret)
608                                 *ret = o;
609
610                         if (offset)
611                                 *offset = p;
612
613                         return 0;
614                 }
615
616                 p = le64toh(o->data.next_hash_offset);
617         }
618
619         r = journal_file_append_object(f, osize, &o, &np);
620         if (r < 0)
621                 return r;
622
623         o->object.type = htole64(OBJECT_DATA);
624         o->data.hash = htole64(hash);
625         memcpy(o->data.payload, data, size);
626
627         r = journal_file_link_data(f, o, np, h);
628         if (r < 0)
629                 return r;
630
631         if (ret)
632                 *ret = o;
633
634         if (offset)
635                 *offset = np;
636
637         return 0;
638 }
639
640 uint64_t journal_file_entry_n_items(Object *o) {
641         assert(o);
642         assert(o->object.type == htole64(OBJECT_ENTRY));
643
644         return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
645 }
646
647 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
648         uint64_t p, q;
649         int r;
650         assert(f);
651         assert(o);
652         assert(offset > 0);
653
654         p = le64toh(o->entry.items[i].object_offset);
655         if (p == 0)
656                 return -EINVAL;
657
658         o->entry.items[i].next_entry_offset = 0;
659
660         /* Move to the data object */
661         r = journal_file_move_to_object(f, p, &o);
662         if (r < 0)
663                 return r;
664
665         if (o->object.type != htole64(OBJECT_DATA))
666                 return -EBADMSG;
667
668         q = le64toh(o->data.tail_entry_offset);
669         o->data.tail_entry_offset = htole64(offset);
670
671         if (q == 0)
672                 o->data.head_entry_offset = htole64(offset);
673         else {
674                 uint64_t n, j;
675
676                 /* Move to previous entry */
677                 r = journal_file_move_to_object(f, q, &o);
678                 if (r < 0)
679                         return r;
680
681                 if (o->object.type != htole64(OBJECT_ENTRY))
682                         return -EBADMSG;
683
684                 n = journal_file_entry_n_items(o);
685                 for (j = 0; j < n; j++)
686                         if (le64toh(o->entry.items[j].object_offset) == p)
687                                 break;
688
689                 if (j >= n)
690                         return -EBADMSG;
691
692                 o->entry.items[j].next_entry_offset = offset;
693         }
694
695         /* Move back to original entry */
696         r = journal_file_move_to_object(f, offset, &o);
697         if (r < 0)
698                 return r;
699
700         o->entry.items[i].prev_entry_offset = q;
701         return 0;
702 }
703
704 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
705         uint64_t p, i, n, k, a, b;
706         int r;
707
708         assert(f);
709         assert(o);
710         assert(offset > 0);
711         assert(o->object.type == htole64(OBJECT_ENTRY));
712
713         /* Link up the entry itself */
714         p = le64toh(f->header->tail_entry_offset);
715
716         o->entry.prev_entry_offset = f->header->tail_entry_offset;
717         o->entry.next_entry_offset = 0;
718
719         if (p == 0)
720                 f->header->head_entry_offset = htole64(offset);
721         else {
722                 /* Temporarily move back to the previous entry, to
723                  * patch in pointer */
724
725                 r = journal_file_move_to_object(f, p, &o);
726                 if (r < 0)
727                         return r;
728
729                 o->entry.next_entry_offset = htole64(offset);
730
731                 r = journal_file_move_to_object(f, offset, &o);
732                 if (r < 0)
733                         return r;
734         }
735
736         f->header->tail_entry_offset = htole64(offset);
737
738         /* Link up the items */
739         n = journal_file_entry_n_items(o);
740         for (i = 0; i < n; i++) {
741                 r = journal_file_link_entry_item(f, o, offset, i);
742                 if (r < 0)
743                         return r;
744         }
745
746         /* Link up the entry in the bisect table */
747         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
748         k = le64toh(f->header->arena_max_size) / n;
749
750         a = (le64toh(f->header->last_bisect_offset) + k - 1) / k;
751         b = offset / k;
752
753         for (; a <= b; a++)
754                 f->bisect_table[a] = htole64(offset);
755
756         f->header->last_bisect_offset = htole64(offset + le64toh(o->object.size));
757
758         return 0;
759 }
760
761 static int journal_file_append_entry_internal(JournalFile *f, const dual_timestamp *ts, const EntryItem items[], unsigned n_items, Object **ret, uint64_t *offset) {
762         uint64_t np;
763         uint64_t osize;
764         Object *o;
765         int r;
766
767         assert(f);
768         assert(items || n_items == 0);
769
770         osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
771
772         r = journal_file_append_object(f, osize, &o, &np);
773         if (r < 0)
774                 return r;
775
776         o->object.type = htole64(OBJECT_ENTRY);
777         o->entry.seqnum = htole64(journal_file_seqnum(f));
778         memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
779         o->entry.realtime = htole64(ts->realtime);
780         o->entry.monotonic = htole64(ts->monotonic);
781
782         r = journal_file_link_entry(f, o, np);
783         if (r < 0)
784                 return r;
785
786         if (ret)
787                 *ret = o;
788
789         if (offset)
790                 *offset = np;
791
792         return 0;
793 }
794
795 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, Object **ret, uint64_t *offset) {
796         unsigned i;
797         EntryItem *items;
798         int r;
799
800         assert(f);
801
802         items = new(EntryItem, n_iovec);
803         if (!items)
804                 return -ENOMEM;
805
806         for (i = 0; i < n_iovec; i++) {
807                 uint64_t p;
808
809                 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, NULL, &p);
810                 if (r < 0)
811                         goto finish;
812
813                 items[i].object_offset = htole64(p);
814         }
815
816         r = journal_file_append_entry_internal(f, ts, items, n_iovec, ret, offset);
817
818 finish:
819         free(items);
820
821         return r;
822 }
823
824 int journal_file_move_to_entry(JournalFile *f, uint64_t seqnum, Object **ret, uint64_t *offset) {
825         Object *o;
826         uint64_t lower, upper, p, n, k;
827         int r;
828
829         assert(f);
830
831         n = le64toh(f->header->bisect_table_size) / sizeof(uint64_t);
832         k = le64toh(f->header->arena_max_size) / n;
833
834         lower = 0;
835         upper = le64toh(f->header->last_bisect_offset)/k+1;
836
837         while (lower < upper) {
838                 k = (upper + lower) / 2;
839                 p = le64toh(f->bisect_table[k]);
840
841                 if (p == 0) {
842                         upper = k;
843                         continue;
844                 }
845
846                 r = journal_file_move_to_object(f, p, &o);
847                 if (r < 0)
848                         return r;
849
850                 if (o->object.type != htole64(OBJECT_ENTRY))
851                         return -EBADMSG;
852
853                 if (o->entry.seqnum == seqnum) {
854                         if (ret)
855                                 *ret = o;
856
857                         if (offset)
858                                 *offset = p;
859
860                         return 1;
861                 } else if (seqnum < o->entry.seqnum)
862                         upper = k;
863                 else if (seqnum > o->entry.seqnum)
864                         lower = k+1;
865         }
866
867         assert(lower == upper);
868
869         if (lower <= 0)
870                 return 0;
871
872         /* The object we are looking for is between
873          * bisect_table[lower-1] and bisect_table[lower] */
874
875         p = le64toh(f->bisect_table[lower-1]);
876
877         for (;;) {
878                 r = journal_file_move_to_object(f, p, &o);
879                 if (r < 0)
880                         return r;
881
882                 if (o->entry.seqnum == seqnum) {
883                         if (ret)
884                                 *ret = o;
885
886                         if (offset)
887                                 *offset = p;
888
889                         return 1;
890
891                 } if (seqnum < o->entry.seqnum)
892                         return 0;
893
894                 if (o->entry.next_entry_offset == 0)
895                         return 0;
896
897                 p = le64toh(o->entry.next_entry_offset);
898         }
899
900         return 0;
901 }
902
903 int journal_file_next_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
904         uint64_t np;
905         int r;
906
907         assert(f);
908
909         if (!o)
910                 np = le64toh(f->header->head_entry_offset);
911         else {
912                 if (le64toh(o->object.type) != OBJECT_ENTRY)
913                         return -EINVAL;
914
915                 np = le64toh(o->entry.next_entry_offset);
916         }
917
918         if (np == 0)
919                 return 0;
920
921         r = journal_file_move_to_object(f, np, &o);
922         if (r < 0)
923                 return r;
924
925         if (le64toh(o->object.type) != OBJECT_ENTRY)
926                 return -EBADMSG;
927
928         if (ret)
929                 *ret = o;
930
931         if (offset)
932                 *offset = np;
933
934         return 1;
935 }
936
937 int journal_file_prev_entry(JournalFile *f, Object *o, Object **ret, uint64_t *offset) {
938         uint64_t np;
939         int r;
940
941         assert(f);
942
943         if (!o)
944                 np = le64toh(f->header->tail_entry_offset);
945         else {
946                 if (le64toh(o->object.type) != OBJECT_ENTRY)
947                         return -EINVAL;
948
949                 np = le64toh(o->entry.prev_entry_offset);
950         }
951
952         if (np == 0)
953                 return 0;
954
955         r = journal_file_move_to_object(f, np, &o);
956         if (r < 0)
957                 return r;
958
959         if (le64toh(o->object.type) != OBJECT_ENTRY)
960                 return -EBADMSG;
961
962         if (ret)
963                 *ret = o;
964
965         if (offset)
966                 *offset = np;
967
968         return 1;
969 }
970
971 int journal_file_find_first_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
972         uint64_t p, osize, hash, h;
973         int r;
974
975         assert(f);
976         assert(data || size == 0);
977
978         osize = offsetof(Object, data.payload) + size;
979
980         hash = hash64(data, size);
981         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
982         p = le64toh(f->hash_table[h].head_hash_offset);
983
984         while (p != 0) {
985                 Object *o;
986
987                 r = journal_file_move_to_object(f, p, &o);
988                 if (r < 0)
989                         return r;
990
991                 if (le64toh(o->object.type) != OBJECT_DATA)
992                         return -EBADMSG;
993
994                 if (le64toh(o->object.size) == osize &&
995                     memcmp(o->data.payload, data, size) == 0) {
996
997                         if (le64toh(o->data.hash) != hash)
998                                 return -EBADMSG;
999
1000                         if (o->data.head_entry_offset == 0)
1001                                 return 0;
1002
1003                         p = le64toh(o->data.head_entry_offset);
1004                         r = journal_file_move_to_object(f, p, &o);
1005                         if (r < 0)
1006                                 return r;
1007
1008                         if (le64toh(o->object.type) != OBJECT_ENTRY)
1009                                 return -EBADMSG;
1010
1011                         if (ret)
1012                                 *ret = o;
1013
1014                         if (offset)
1015                                 *offset = p;
1016
1017                         return 1;
1018                 }
1019
1020                 p = le64toh(o->data.next_hash_offset);
1021         }
1022
1023         return 0;
1024 }
1025
1026 int journal_file_find_last_entry(JournalFile *f, const void *data, uint64_t size, Object **ret, uint64_t *offset) {
1027         uint64_t p, osize, hash, h;
1028         int r;
1029
1030         assert(f);
1031         assert(data || size == 0);
1032
1033         osize = offsetof(Object, data.payload) + size;
1034
1035         hash = hash64(data, size);
1036         h = hash % (le64toh(f->header->hash_table_size) / sizeof(HashItem));
1037         p = le64toh(f->hash_table[h].tail_hash_offset);
1038
1039         while (p != 0) {
1040                 Object *o;
1041
1042                 r = journal_file_move_to_object(f, p, &o);
1043                 if (r < 0)
1044                         return r;
1045
1046                 if (le64toh(o->object.type) != OBJECT_DATA)
1047                         return -EBADMSG;
1048
1049                 if (le64toh(o->object.size) == osize &&
1050                     memcmp(o->data.payload, data, size) == 0) {
1051
1052                         if (le64toh(o->data.hash) != hash)
1053                                 return -EBADMSG;
1054
1055                         if (o->data.tail_entry_offset == 0)
1056                                 return 0;
1057
1058                         p = le64toh(o->data.tail_entry_offset);
1059                         r = journal_file_move_to_object(f, p, &o);
1060                         if (r < 0)
1061                                 return r;
1062
1063                         if (le64toh(o->object.type) != OBJECT_ENTRY)
1064                                 return -EBADMSG;
1065
1066                         if (ret)
1067                                 *ret = o;
1068
1069                         if (offset)
1070                                 *offset = p;
1071
1072                         return 1;
1073                 }
1074
1075                 p = le64toh(o->data.prev_hash_offset);
1076         }
1077
1078         return 0;
1079 }
1080
1081 void journal_file_dump(JournalFile *f) {
1082         char a[33], b[33], c[33];
1083         Object *o;
1084         int r;
1085         uint64_t p;
1086
1087         assert(f);
1088
1089         printf("File ID: %s\n"
1090                "Machine ID: %s\n"
1091                "Boot ID: %s\n"
1092                "Arena size: %llu\n",
1093                sd_id128_to_string(f->header->file_id, a),
1094                sd_id128_to_string(f->header->machine_id, b),
1095                sd_id128_to_string(f->header->boot_id, c),
1096                (unsigned long long) le64toh(f->header->arena_size));
1097
1098         p = le64toh(f->header->head_object_offset);
1099         while (p != 0) {
1100                 r = journal_file_move_to_object(f, p, &o);
1101                 if (r < 0)
1102                         goto fail;
1103
1104                 switch (o->object.type) {
1105
1106                 case OBJECT_UNUSED:
1107                         printf("Type: OBJECT_UNUSED\n");
1108                         break;
1109
1110                 case OBJECT_DATA:
1111                         printf("Type: OBJECT_DATA\n");
1112                         break;
1113
1114                 case OBJECT_ENTRY:
1115                         printf("Type: OBJECT_ENTRY %llu\n", (unsigned long long) le64toh(o->entry.seqnum));
1116                         break;
1117
1118                 case OBJECT_HASH_TABLE:
1119                         printf("Type: OBJECT_HASH_TABLE\n");
1120                         break;
1121
1122                 case OBJECT_BISECT_TABLE:
1123                         printf("Type: OBJECT_BISECT_TABLE\n");
1124                         break;
1125                 }
1126
1127                 if (p == le64toh(f->header->tail_object_offset))
1128                         p = 0;
1129                 else
1130                         p = p + ALIGN64(le64toh(o->object.size));
1131         }
1132
1133         return;
1134 fail:
1135         log_error("File corrupt");
1136 }
1137
1138 int journal_file_open(
1139                 sd_journal *j,
1140                 const char *fname,
1141                 int flags,
1142                 mode_t mode,
1143                 JournalFile **ret) {
1144
1145         JournalFile *f;
1146         int r;
1147         bool newly_created = false;
1148
1149         assert(fname);
1150
1151         if ((flags & O_ACCMODE) != O_RDONLY &&
1152             (flags & O_ACCMODE) != O_RDWR)
1153                 return -EINVAL;
1154
1155         f = new0(JournalFile, 1);
1156         if (!f)
1157                 return -ENOMEM;
1158
1159         f->writable = (flags & O_ACCMODE) != O_RDONLY;
1160         f->prot = prot_from_flags(flags);
1161
1162         f->fd = open(fname, flags|O_CLOEXEC, mode);
1163         if (f->fd < 0) {
1164                 r = -errno;
1165                 goto fail;
1166         }
1167
1168         f->path = strdup(fname);
1169         if (!f->path) {
1170                 r = -ENOMEM;
1171                 goto fail;
1172         }
1173
1174         if (fstat(f->fd, &f->last_stat) < 0) {
1175                 r = -errno;
1176                 goto fail;
1177         }
1178
1179         if (f->last_stat.st_size == 0 && f->writable) {
1180                 newly_created = true;
1181
1182                 r = journal_file_init_header(f);
1183                 if (r < 0)
1184                         goto fail;
1185
1186                 if (fstat(f->fd, &f->last_stat) < 0) {
1187                         r = -errno;
1188                         goto fail;
1189                 }
1190         }
1191
1192         if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1193                 r = -EIO;
1194                 goto fail;
1195         }
1196
1197         f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1198         if (f->header == MAP_FAILED) {
1199                 f->header = NULL;
1200                 r = -errno;
1201                 goto fail;
1202         }
1203
1204         if (!newly_created) {
1205                 r = journal_file_verify_header(f);
1206                 if (r < 0)
1207                         goto fail;
1208         }
1209
1210         if (f->writable) {
1211                 r = journal_file_refresh_header(f);
1212                 if (r < 0)
1213                         goto fail;
1214         }
1215
1216         if (newly_created) {
1217
1218                 r = journal_file_setup_hash_table(f);
1219                 if (r < 0)
1220                         goto fail;
1221
1222                 r = journal_file_setup_bisect_table(f);
1223                 if (r < 0)
1224                         goto fail;
1225         }
1226
1227         r = journal_file_map_hash_table(f);
1228         if (r < 0)
1229                 goto fail;
1230
1231         r = journal_file_map_bisect_table(f);
1232         if (r < 0)
1233                 goto fail;
1234
1235         if (j) {
1236                 LIST_PREPEND(JournalFile, files, j->files, f);
1237                 f->journal = j;
1238         }
1239
1240         if (ret)
1241                 *ret = f;
1242
1243         return 0;
1244
1245 fail:
1246         journal_file_close(f);
1247
1248         return r;
1249 }
1250
1251 int sd_journal_open(sd_journal **ret) {
1252         sd_journal *j;
1253         char *fn;
1254         const char *p;
1255         int r = 0;
1256         const char search_paths[] =
1257                 "/run/log/journal\0"
1258                 "/var/log/journal\0";
1259
1260         assert(ret);
1261
1262         j = new0(sd_journal, 1);
1263         if (!j)
1264                 return -ENOMEM;
1265
1266         NULSTR_FOREACH(p, search_paths) {
1267                 DIR *d;
1268
1269                 d = opendir(p);
1270                 if (!d) {
1271                         if (errno != ENOENT && r == 0)
1272                                 r = -errno;
1273
1274                         continue;
1275                 }
1276
1277                 for (;;) {
1278                         struct dirent buf, *de;
1279                         int k;
1280
1281                         k = readdir_r(d, &buf, &de);
1282                         if (k != 0) {
1283                                 if (r == 0)
1284                                         r = -k;
1285
1286                                 break;
1287                         }
1288
1289                         if (!de)
1290                                 break;
1291
1292                         if (!dirent_is_file_with_suffix(de, ".journal"))
1293                                 continue;
1294
1295                         fn = join(p, "/", de->d_name, NULL);
1296                         if (!fn) {
1297                                 r = -ENOMEM;
1298                                 closedir(d);
1299                                 goto fail;
1300                         }
1301
1302                         k = journal_file_open(j, fn, O_RDONLY, 0, NULL);
1303                         if (k < 0 && r == 0)
1304                                 r = -k;
1305
1306                         free(fn);
1307                 }
1308         }
1309
1310         if (!j->files) {
1311                 if (r >= 0)
1312                         r = -ENOENT;
1313
1314                 goto fail;
1315         }
1316
1317         *ret = j;
1318         return 0;
1319
1320 fail:
1321         sd_journal_close(j);
1322
1323         return r;
1324 };
1325
1326 void sd_journal_close(sd_journal *j) {
1327         assert(j);
1328
1329         while (j->files)
1330                 journal_file_close(j->files);
1331
1332         free(j);
1333 }