1 /* $Id: tdx-data.c 7598 2007-02-09 02:40:51Z eagle $
3 ** Overview data file handling for the tradindexed overview method.
5 ** Implements the handling of the .IDX and .DAT files for the tradindexed
6 ** overview method. The .IDX files are flat arrays of binary structs
7 ** specifying the offset in the data file of the overview data for a given
8 ** article as well as the length of that data and some additional meta-data
9 ** about that article. The .DAT files contain all of the overview data for
10 ** that group in wire format.
12 ** Externally visible functions have a tdx_ prefix; internal functions do
13 ** not. (Externally visible unfortunately means everything that needs to be
14 ** visible outside of this object file, not just interfaces exported to
15 ** consumers of the overview API.)
20 #include "portable/mmap.h"
25 #include "inn/history.h"
26 #include "inn/innconf.h"
27 #include "inn/messages.h"
31 #include "ovinterface.h"
33 #include "tdx-private.h"
34 #include "tdx-structure.h"
36 /* Returned to callers as an opaque data type, this holds the information
37 needed to manage a search in progress. */
41 struct group_data *data;
44 /* Internal prototypes. */
45 static char *group_path(const char *group);
46 static int file_open(const char *base, const char *suffix, bool writable,
48 static bool file_open_index(struct group_data *, const char *suffix);
49 static bool file_open_data(struct group_data *, const char *suffix);
50 static void *map_file(int fd, size_t length, const char *base,
52 static bool map_index(struct group_data *data);
53 static bool map_data(struct group_data *data);
54 static void unmap_index(struct group_data *data);
55 static void unmap_data(struct group_data *data);
56 static ARTNUM index_base(ARTNUM artnum);
60 ** Determine the path to the data files for a particular group and return
61 ** it. Allocates memory which the caller is responsible for freeing.
64 group_path(const char *group)
70 /* The path of the data files for news.groups is dir/n/g/news.groups. In
71 other words, the first letter of each component becomes a directory.
72 The length of the path is therefore the length of the base overview
73 directory path, one character for the slash, two characters for the
74 first letter and initial slash, two characters for each hierarchical
75 level of the group, and then the length of the group name.
77 For robustness, we want to handle leading or multiple consecutive
78 periods. We only recognize a new hierarchical level after a string of
79 periods (which doesn't end the group name). */
80 length = strlen(innconf->pathoverview);
81 for (gp = group; *gp != '\0'; gp++)
83 if (gp[1] == '.' || gp[0] == '\0')
87 length += 1 + 2 + strlen(group) + 1;
88 path = xmalloc(length);
89 strlcpy(path, innconf->pathoverview, length);
90 p = path + strlen(innconf->pathoverview);
92 /* Generate the hierarchical directories. */
93 if (*group != '.' && *group != '\0') {
97 for (gp = strchr(group, '.'); gp != NULL; gp = strchr(gp, '.')) {
101 if (*gp != '\0' && *gp != '.' && *gp != '/') {
108 /* Finally, append the group name to the generated path and then replace
109 all slashes with commas. Commas have the advantage of being clearly
110 illegal in newsgroup names because of the syntax of the Newsgroups
111 header, but aren't shell metacharacters. */
112 strlcpy(p, group, length - (p - path));
113 for (; *p != '\0'; p++)
121 ** Open a data file for a group. Takes the base portion of the file, the
122 ** suffix, a bool saying whether or not the file is being opened for write,
123 ** and a bool saying whether to open it for append. Returns the file
127 file_open(const char *base, const char *suffix, bool writable, bool append)
132 file = concat(base, ".", suffix, (char *) 0);
133 flags = writable ? (O_RDWR | O_CREAT) : O_RDONLY;
136 fd = open(file, flags, ARTFILE_MODE);
137 if (fd < 0 && writable && errno == ENOENT) {
138 char *p = strrchr(file, '/');
141 if (!MakeDirectory(file, true)) {
142 syswarn("tradindexed: cannot create directory %s", file);
147 fd = open(file, flags, ARTFILE_MODE);
151 syswarn("tradindexed: cannot open %s", file);
161 ** Open the index file for a group. Takes an optional suffix to use instead
162 ** of IDX (used primarily for expiring).
165 file_open_index(struct group_data *data, const char *suffix)
171 if (data->indexfd >= 0)
172 close(data->indexfd);
173 data->indexfd = file_open(data->path, suffix, data->writable, false);
174 if (data->indexfd < 0)
176 if (fstat(data->indexfd, &st) < 0) {
177 syswarn("tradindexed: cannot stat %s.%s", data->path, suffix);
178 close(data->indexfd);
181 data->indexinode = st.st_ino;
182 close_on_exec(data->indexfd, true);
188 ** Open the data file for a group. Takes an optional suffix to use instead
189 ** of DAT (used primarily for expiring).
192 file_open_data(struct group_data *data, const char *suffix)
196 if (data->datafd >= 0)
198 data->datafd = file_open(data->path, suffix, data->writable, true);
199 if (data->datafd < 0)
201 close_on_exec(data->datafd, true);
207 ** Open a particular group. Allocates a new struct group_data that should be
208 ** passed to tdx_data_close() when the caller is done with it.
211 tdx_data_new(const char *group, bool writable)
213 struct group_data *data;
215 data = xmalloc(sizeof(struct group_data));
216 data->path = group_path(group);
217 data->writable = writable;
226 data->indexinode = 0;
234 ** Open the index and data files for a group.
237 tdx_data_open_files(struct group_data *data)
243 if (!file_open_index(data, NULL))
245 if (!file_open_data(data, NULL))
250 if (data->indexfd >= 0)
251 close(data->indexfd);
252 if (data->datafd >= 0)
259 ** Map a data file (either index or data), or read in all of the data in the
260 ** file if we're avoiding mmap. Takes the base and suffix of the file for
264 map_file(int fd, size_t length, const char *base, const char *suffix)
271 if (!innconf->tradindexedmmap) {
274 data = xmalloc(length);
275 status = read(fd, data, length);
276 if ((size_t) status != length) {
277 syswarn("tradindexed: cannot read data file %s.%s", base, suffix);
282 data = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
283 if (data == MAP_FAILED) {
284 syswarn("tradindexed: cannot mmap %s.%s", base, suffix);
293 ** Memory map the index file.
296 map_index(struct group_data *data)
301 r = fstat(data->indexfd, &st);
303 if (errno == ESTALE) {
304 r = file_open_index(data, NULL);
306 syswarn("tradindexed: cannot stat %s.IDX", data->path);
311 data->indexlen = st.st_size;
312 data->index = map_file(data->indexfd, data->indexlen, data->path, "IDX");
313 return (data->index == NULL && data->indexlen > 0) ? false : true;
318 ** Memory map the data file.
321 map_data(struct group_data *data)
326 r = fstat(data->datafd, &st);
328 if (errno == ESTALE) {
329 r = file_open_data(data, NULL);
331 syswarn("tradindexed: cannot stat %s.DAT", data->path);
336 data->datalen = st.st_size;
337 data->data = map_file(data->datafd, data->datalen, data->path, "DAT");
338 return (data->data == NULL && data->indexlen > 0) ? false : true;
343 ** Unmap a data file or free the memory copy if we're not using mmap. Takes
344 ** the memory to free or unmap, the length for munmap, and the name base and
345 ** suffix for error reporting.
348 unmap_file(void *data, off_t length, const char *base, const char *suffix)
352 if (!innconf->tradindexedmmap)
355 if (munmap(data, length) < 0)
356 syswarn("tradindexed: cannot munmap %s.%s", base, suffix);
362 ** Unmap the index file.
365 unmap_index(struct group_data *data)
367 unmap_file(data->index, data->indexlen, data->path, "IDX");
373 ** Unmap the data file.
376 unmap_data(struct group_data *data)
378 unmap_file(data->data, data->datalen, data->path, "DAT");
383 ** Determine if the file handle associated with the index table is stale
386 stale_index(struct group_data *data)
391 r = fstat(data->indexfd, &st);
392 return r == -1 && errno == ESTALE;
397 ** Determine if the file handle associated with the data table is stale
400 stale_data(struct group_data *data)
405 r = fstat(data->datafd, &st);
406 return r == -1 && errno == ESTALE;
411 ** Retrieves the article metainformation stored in the index table (all the
412 ** stuff we can return without opening the data file). Takes the article
413 ** number and returns a pointer to the index entry. Also takes the high
414 ** water mark from the group index; this is used to decide whether to attempt
415 ** remapping of the index file if the current high water mark is too low.
417 const struct index_entry *
418 tdx_article_entry(struct group_data *data, ARTNUM article, ARTNUM high)
420 struct index_entry *entry;
423 if (article > data->high && high > data->high) {
427 } else if (innconf->nfsreader && stale_index(data))
429 if (data->index == NULL)
430 if (!map_index(data))
433 if (article < data->base)
435 offset = article - data->base;
436 if (offset >= data->indexlen / sizeof(struct index_entry))
438 entry = data->index + offset;
439 if (entry->length == 0)
446 ** Begin an overview search. In addition to the bounds of the search, we
447 ** also take the high water mark from the group index; this is used to decide
448 ** whether or not to attempt remapping of the index file if the current high
449 ** water mark is too low.
452 tdx_search_open(struct group_data *data, ARTNUM start, ARTNUM end, ARTNUM high)
454 struct search *search;
456 if (end < data->base)
461 if (end > data->high && high > data->high) {
466 if (start > data->high)
469 if (innconf->nfsreader && stale_index(data))
471 if (data->index == NULL)
472 if (!map_index(data))
474 if (innconf->nfsreader && stale_data(data))
476 if (data->data == NULL)
480 search = xmalloc(sizeof(struct search));
481 search->limit = end - data->base;
482 search->current = (start < data->base) ? 0 : start - data->base;
484 search->data->refcount++;
491 ** Return the next record in a search.
494 tdx_search(struct search *search, struct article *artdata)
496 struct index_entry *entry;
499 if (search == NULL || search->data == NULL)
501 if (search->data->index == NULL || search->data->data == NULL)
504 max = (search->data->indexlen / sizeof(struct index_entry)) - 1;
505 entry = search->data->index + search->current;
506 while (search->current <= search->limit && search->current <= max) {
507 if (entry->length != 0)
512 if (search->current > search->limit || search->current > max)
515 /* Make sure that the offset into the data file is sensible, and try
516 remapping the data file if the portion the offset is pointing to isn't
517 currently mapped. Otherwise, warn about possible corruption and return
519 if (entry->offset + entry->length > search->data->datalen) {
520 unmap_data(search->data);
521 if (!map_data(search->data))
524 if (entry->offset + entry->length > search->data->datalen) {
525 warn("Invalid entry for article %lu in %s.IDX: offset %lu length %lu",
526 search->current + search->data->base, search->data->path,
527 (unsigned long) entry->offset, (unsigned long) entry->length);
531 artdata->number = search->current + search->data->base;
532 artdata->overview = search->data->data + entry->offset;
533 artdata->overlen = entry->length;
534 artdata->token = entry->token;
535 artdata->arrived = entry->arrived;
536 artdata->expires = entry->expires;
544 ** End an overview search.
547 tdx_search_close(struct search *search)
549 if (search->data != NULL) {
550 search->data->refcount--;
551 if (search->data->refcount == 0)
552 tdx_data_close(search->data);
559 ** Given an article number, return an index base appropriate for that article
560 ** number. This includes a degree of slop so that we don't have to
561 ** constantly repack if the article numbers are clustered around a particular
562 ** value but don't come in order.
565 index_base(ARTNUM artnum)
567 return (artnum > 128) ? (artnum - 128) : 1;
572 ** Store the data for a single article into the overview files for a group.
573 ** Assumes any necessary repacking has already been done. If the base value
574 ** in the group_data structure is 0, assumes this is the first time we've
575 ** written overview information to this group and sets it appropriately.
578 tdx_data_store(struct group_data *data, const struct article *article)
580 struct index_entry entry;
586 data->base = index_base(article->number);
587 if (data->base > article->number) {
588 warn("tradindexed: cannot add %lu to %s.IDX, base == %lu",
589 article->number, data->path, data->base);
593 /* Write out the data and fill in the index entry. */
594 memset(&entry, 0, sizeof(entry));
595 if (xwrite(data->datafd, article->overview, article->overlen) < 0) {
596 syswarn("tradindexed: cannot append %lu of data for %lu to %s.DAT",
597 (unsigned long) article->overlen, article->number,
601 entry.offset = lseek(data->datafd, 0, SEEK_CUR);
602 if (entry.offset < 0) {
603 syswarn("tradindexed: cannot get offset for article %lu in %s.DAT",
604 article->number, data->path);
607 entry.length = article->overlen;
608 entry.offset -= entry.length;
609 entry.arrived = article->arrived;
610 entry.expires = article->expires;
611 entry.token = article->token;
613 /* Write out the index entry. */
614 offset = (article->number - data->base) * sizeof(struct index_entry);
615 if (xpwrite(data->indexfd, &entry, sizeof(entry), offset) < 0) {
616 syswarn("tradindexed: cannot write index record for %lu in %s.IDX",
617 article->number, data->path);
625 ** Start the process of packing a group (rewriting its index file so that it
626 ** uses a different article base). Takes the article number of an article
627 ** that needs to be written to the index file and is below the current base.
628 ** Returns true on success and false on failure, and sets data->base to the
629 ** new article base and data->indexinode to the new inode number. At the
630 ** conclusion of this routine, the new index file has been created, but it
631 ** has not yet been moved into place; that is done by tdx_data_pack_finish.
634 tdx_data_pack_start(struct group_data *data, ARTNUM artnum)
644 if (data->base <= artnum) {
645 warn("tradindexed: tdx_data_pack_start called unnecessarily");
649 /* Open the new index file. */
650 base = index_base(artnum);
651 delta = data->base - base;
652 fd = file_open(data->path, "IDX-NEW", true, false);
655 if (fstat(fd, &st) < 0) {
656 warn("tradindexed: cannot stat %s.IDX-NEW", data->path);
660 /* For convenience, memory map the old index file. */
662 if (!map_index(data))
665 /* Write the contents of the old index file to the new index file. */
666 if (lseek(fd, delta * sizeof(struct index_entry), SEEK_SET) < 0) {
667 syswarn("tradindexed: cannot seek in %s.IDX-NEW", data->path);
670 if (xwrite(fd, data->index, data->indexlen) < 0) {
671 syswarn("tradindexed: cannot write to %s.IDX-NEW", data->path);
675 syswarn("tradindexed: cannot close %s.IDX-NEW", data->path);
679 data->indexinode = st.st_ino;
685 idxfile = concat(data->path, ".IDX-NEW", (char *) 0);
686 if (unlink(idxfile) < 0)
687 syswarn("tradindexed: cannot unlink %s", idxfile);
695 ** Finish the process of packing a group by replacing the old index with the
696 ** new index. Also reopen the index file and update indexinode to keep our
697 ** caller from having to close and reopen the index file themselves.
700 tdx_data_pack_finish(struct group_data *data)
706 newidx = concat(data->path, ".IDX-NEW", (char *) 0);
707 idx = concat(data->path, ".IDX", (char *) 0);
708 if (rename(newidx, idx) < 0) {
709 syswarn("tradindexed: cannot rename %s to %s", newidx, idx);
717 if (!file_open_index(data, NULL))
725 ** Open the data files for a group data rebuild, and return a struct
726 ** group_data for the new files. Calling this function doesn't interfere
727 ** with the existing data for the group. Either tdx_data_rebuild_abort or
728 ** tdx_data_rebuild_finish should be called on the returned struct group_data
729 ** when the caller is done.
732 tdx_data_rebuild_start(const char *group)
734 struct group_data *data;
736 data = tdx_data_new(group, true);
737 tdx_data_delete(group, "-NEW");
738 if (!file_open_index(data, "IDX-NEW"))
740 if (!file_open_data(data, "DAT-NEW"))
745 tdx_data_delete(group, "-NEW");
746 tdx_data_close(data);
752 ** Finish a rebuild by renaming the new index and data files to their
756 tdx_data_rebuild_finish(const char *group)
758 char *base, *newidx, *bakidx, *idx, *newdat, *dat;
761 base = group_path(group);
762 idx = concat(base, ".IDX", (char *) 0);
763 newidx = concat(base, ".IDX-NEW", (char *) 0);
764 bakidx = concat(base, ".IDX-BAK", (char *) 0);
765 dat = concat(base, ".DAT", (char *) 0);
766 newdat = concat(base, ".DAT-NEW", (char *) 0);
768 if (rename(idx, bakidx) < 0) {
769 syswarn("tradindexed: cannot rename %s to %s", idx, bakidx);
774 if (rename(newidx, idx) < 0) {
775 syswarn("tradindexed: cannot rename %s to %s", newidx, idx);
778 if (rename(newdat, dat) < 0) {
779 syswarn("tradindexed: cannot rename %s to %s", newdat, dat);
782 if (unlink(bakidx) < 0)
783 syswarn("tradindexed: cannot remove backup %s", bakidx);
792 if (saved && rename(bakidx, idx) < 0)
793 syswarn("tradindexed: cannot restore old index %s", bakidx);
804 ** Do the main work of expiring a group. Step through each article in the
805 ** group, only writing the unexpired entries out to the new group. There's
806 ** probably some room for optimization here for newsgroups that don't expire
807 ** so that the files don't have to be rewritten, or newsgroups where all the
808 ** data at the end of the file is still good and just needs to be moved
812 tdx_data_expire_start(const char *group, struct group_data *data,
813 struct group_entry *index, struct history *history)
815 struct group_data *new_data;
816 struct search *search;
817 struct article article;
820 new_data = tdx_data_rebuild_start(group);
821 if (new_data == NULL)
823 index->indexinode = new_data->indexinode;
825 /* Try to make sure that the search range is okay for even an empty group
826 so that we can treat all errors on opening a search as errors. */
827 high = index->high > 0 ? index->high : data->base;
828 new_data->high = high;
829 search = tdx_search_open(data, data->base, high, high);
833 /* Loop through all of the articles in the group, adding the ones that are
834 still valid to the new index. */
835 while (tdx_search(search, &article)) {
838 if (!SMprobe(EXPENSIVESTAT, &article.token, NULL) || OVstatall) {
839 ah = SMretrieve(article.token, RETR_STAT);
844 if (!OVhisthasmsgid(history, article.overview))
847 if (innconf->groupbaseexpiry)
848 if (OVgroupbasedexpire(article.token, group, article.overview,
849 article.overlen, article.arrived,
852 if (!tdx_data_store(new_data, &article))
854 if (index->base == 0) {
855 index->base = new_data->base;
856 index->low = article.number;
858 if (article.number > index->high)
859 index->high = article.number;
863 /* Done; the rest happens in tdx_data_rebuild_finish. */
864 tdx_data_close(new_data);
868 tdx_data_delete(group, "-NEW");
869 tdx_data_close(new_data);
875 ** Close the data files for a group and free the data structure.
878 tdx_data_close(struct group_data *data)
882 if (data->indexfd >= 0)
883 close(data->indexfd);
884 if (data->datafd >= 0)
892 ** Delete the data files for a particular group, called when that group is
893 ** deleted from the server. Takes an optional suffix, which if present is
894 ** appended to the ends of the file names (used by expire to delete the -NEW
895 ** versions of the files).
898 tdx_data_delete(const char *group, const char *suffix)
900 char *path, *idx, *dat;
902 path = group_path(group);
903 idx = concat(path, ".IDX", suffix, (char *) 0);
904 dat = concat(path, ".DAT", suffix, (char *) 0);
905 if (unlink(idx) < 0 && errno != ENOENT)
906 syswarn("tradindexed: cannot unlink %s", idx);
907 if (unlink(dat) < 0 && errno != ENOENT)
908 syswarn("tradindexed: cannot unlink %s", dat);
916 ** RECOVERY AND AUDITING
918 ** All code below this point is not used in the normal operations of the
919 ** overview method. Instead, it's code to dump various data structures or
920 ** audit them for consistency, used by recovery tools and inspection tools.
924 ** Dump the index file for a given group in human-readable format.
927 tdx_data_index_dump(struct group_data *data, FILE *output)
930 struct index_entry *entry, *end;
932 if (data->index == NULL)
933 if (!map_index(data))
936 current = data->base;
937 end = data->index + (data->indexlen / sizeof(struct index_entry));
938 for (entry = data->index; entry < end; entry++) {
939 fprintf(output, "%lu %lu %lu %lu %lu %s\n", current,
940 (unsigned long) entry->offset, (unsigned long) entry->length,
941 (unsigned long) entry->arrived,
942 (unsigned long) entry->expires, TokenToText(entry->token));
949 ** Audit a specific index entry for a particular article. If there's
950 ** anything wrong with it, we delete it; to repair a particular group, it's
951 ** best to just regenerate it from scratch.
954 entry_audit(struct group_data *data, struct index_entry *entry,
955 const char *group, ARTNUM article, bool fix)
957 struct index_entry new_entry;
960 if (entry->length < 0) {
961 warn("tradindexed: negative length %d in %s:%lu", entry->length,
967 if (entry->offset > data->datalen || entry->length > data->datalen) {
968 warn("tradindexed: offset %lu or length %lu out of bounds for %s:%lu",
969 (unsigned long) entry->offset, (unsigned long) entry->length,
975 if (entry->offset + entry->length > data->datalen) {
976 warn("tradindexed: offset %lu plus length %lu out of bounds for"
977 " %s:%lu", (unsigned long) entry->offset,
978 (unsigned long) entry->length, group, article);
983 if (!overview_check(data->data + entry->offset, entry->length, article)) {
984 warn("tradindexed: malformed overview data for %s:%lu", group,
993 new_entry.offset = 0;
994 new_entry.length = 0;
995 offset = (entry - data->index) * sizeof(struct index_entry);
996 if (xpwrite(data->indexfd, &new_entry, sizeof(new_entry), offset) != 0)
997 warn("tradindexed: unable to repair %s:%lu", group, article);
1002 ** Audit the data for a particular group. Takes the index entry from the
1003 ** group.index file and optionally corrects any problems with the data or the
1004 ** index entry based on the contents of the data.
1007 tdx_data_audit(const char *group, struct group_entry *index, bool fix)
1009 struct group_data *data;
1010 struct index_entry *entry;
1013 unsigned long entries, current;
1015 bool changed = false;
1017 data = tdx_data_new(group, true);
1018 if (!tdx_data_open_files(data))
1020 if (!map_index(data))
1022 if (!map_data(data))
1025 /* Check the inode of the index. */
1026 if (data->indexinode != index->indexinode) {
1027 warn("tradindexed: index inode mismatch for %s: %lu != %lu", group,
1028 (unsigned long) data->indexinode,
1029 (unsigned long) index->indexinode);
1031 index->indexinode = data->indexinode;
1036 /* Check the index size. */
1037 entries = data->indexlen / sizeof(struct index_entry);
1038 expected = entries * sizeof(struct index_entry);
1039 if (data->indexlen != expected) {
1040 warn("tradindexed: %lu bytes of trailing trash in %s.IDX",
1041 (unsigned long)(data->indexlen - expected), data->path);
1044 if (ftruncate(data->indexfd, expected) < 0)
1045 syswarn("tradindexed: cannot truncate %s.IDX", data->path);
1046 if (!map_index(data))
1051 /* Now iterate through all of the index entries. In addition to checking
1052 each one individually, also count the number of valid entries to check
1053 the count in the index and verify that the low water mark is
1055 for (current = 0, count = 0; current < entries; current++) {
1056 entry = &data->index[current];
1057 if (entry->length == 0)
1059 entry_audit(data, entry, group, index->base + current, fix);
1060 if (entry->length != 0) {
1062 low = index->base + current;
1066 if (index->low != low && entries != 0) {
1067 warn("tradindexed: low water mark incorrect for %s: %lu != %lu",
1068 group, low, index->low);
1074 if (index->count != count) {
1075 warn("tradindexed: count incorrect for %s: %lu != %lu", group,
1076 (unsigned long) count, (unsigned long) index->count);
1078 index->count = count;
1083 /* All done. Close things down and flush the data we changed, if
1086 inn_mapcntl(index, sizeof(*index), MS_ASYNC);
1089 tdx_data_close(data);