From: Ian Jackson Date: Fri, 18 Oct 2024 16:21:08 +0000 (+0100) Subject: Fix operational problems with persistently-deferring peers X-Git-Url: http://www.chiark.greenend.org.uk/ucgi/~ian/git?a=commitdiff_plain;h=0e21f4a2d8773249bb78b9c32e07aa7e2798e640;p=innduct.git Fix operational problems with persistently-deferring peers See the new comment. This situation would cause a lot of wasteful resource usage, and a very large debug log. Signed-off-by: Ian Jackson --- diff --git a/duct.c b/duct.c index bf8c808..b76cdf8 100644 --- a/duct.c +++ b/duct.c @@ -47,6 +47,8 @@ int target_max_feedfile_size=100000; int period_seconds=30; int filepoll_seconds=5; int max_queue_per_ipf=-1; +int defer_all_max_per_sec=10; +int defer_all_burst_pause=10; int connection_setup_timeout=200; int inndcomm_flush_timeout=100; @@ -359,6 +361,8 @@ static const Option innduct_options[]= { {0,"no-check-proportion", "PERCENT", &nocheck_thresh, op_double }, {0,"no-check-response-time","ARTICLES", &nocheck_decay, op_double }, +{0,"defer-all-max-per-sec", "ARTICLES", &defer_all_max_per_sec, op_integer }, +{0,"defer-all-burst-pause", "TIME", &defer_all_burst_pause, op_seconds }, {0,"reconnect-interval", "PERIOD", &reconnect_delay_periods, op_seconds }, {0,"flush-retry-interval", "PERIOD", &flushfail_retry_periods, op_seconds }, diff --git a/innduct.h b/innduct.h index c85d057..c24f8ec 100644 --- a/innduct.h +++ b/innduct.h @@ -151,6 +151,7 @@ extern const char *inndconffile; extern int max_connections, max_queue_per_conn, target_max_feedfile_size; extern int period_seconds, filepoll_seconds, max_queue_per_ipf; extern int connection_setup_timeout, inndcomm_flush_timeout; +extern int defer_all_max_per_sec, defer_all_burst_pause; extern double nocheck_thresh; extern double nocheck_decay; diff --git a/recv.c b/recv.c index 44587bf..1fb746a 100644 --- a/recv.c +++ b/recv.c @@ -12,6 +12,8 @@ /*========== handling responses from peer ==========*/ +static int defer_consecutive_count; + const oop_rd_style peer_rd_style= { OOP_RD_DELIM_STRIP, '\n', OOP_RD_NUL_FORBID, @@ -193,6 +195,7 @@ void *peer_rd_ok(oop_source *lp, oop_read *oread, oop_rd_event ev, code_streaming= (streaming); \ GET_ARTICLE(musthavesent); \ article_done(art, RC_##how); \ + defer_consecutive_count = 0; \ goto dealtwith; \ }while(0) @@ -241,6 +244,24 @@ void *peer_rd_ok(oop_source *lp, oop_read *oread, oop_rd_event ev, case 436: /* IHAVE says try later */ GET_ARTICLE(0); article_defer(art, RC_deferred); + + // Some implementations don't reject connections when they're expiring + // etc., but accept them and defer every article. If such a situation + // persists, we will run through the entire backlog each retry interval + // getting the same answer to each. We try to detect this. If we only + // seem to be getting deferrals, we pause the whole feed. + defer_consecutive_count += 1; + if (defer_consecutive_count + > defer_all_max_per_sec * defer_all_burst_pause) { + warn("%d consecutive deferrals, pausing %ds; last response: %s", + defer_consecutive_count, defer_all_burst_pause, data); + defer_consecutive_count = 0; + // This causes the *whole program* including the entire event loop + // to freeze. That is, surprisingly, what we want. When we wake up + // hopefully things are better. If not we will keep sleeping here, + // and making no progress - which is as expected. + sleep(defer_all_burst_pause); + } break; }