chiark / gitweb /
Fix operational problems with persistently-deferring peers
author Ian Jackson <ijackson@chiark.greenend.org.uk>
Fri, 18 Oct 2024 16:21:08 +0000 (17:21 +0100)
committer Ian Jackson <ijackson@chiark.greenend.org.uk>
Fri, 18 Oct 2024 16:36:51 +0000 (17:36 +0100)
See the new comment.  This situation would cause a lot of wasteful
resource usage, and a very large debug log.

Signed-off-by: Ian Jackson <ijackson@chiark.greenend.org.uk>
duct.c
innduct.h
recv.c

diff --git a/duct.c b/duct.c
index bf8c8087842c986400cfd1598bf3191876f1b046..b76cdf8b3dbd49e7b10c2fc4aea1fb1738bdd973 100644 (file)
--- a/duct.c
+++ b/duct.c
@@ -47,6 +47,8 @@ int target_max_feedfile_size=100000;
 int period_seconds=30;
 int filepoll_seconds=5;
 int max_queue_per_ipf=-1;
+int defer_all_max_per_sec=10;
+int defer_all_burst_pause=10;
 
 int connection_setup_timeout=200;
 int inndcomm_flush_timeout=100;
@@ -359,6 +361,8 @@ static const Option innduct_options[]= {
 
 {0,"no-check-proportion",   "PERCENT",   &nocheck_thresh,       op_double   },
 {0,"no-check-response-time","ARTICLES",  &nocheck_decay,        op_double   },
+{0,"defer-all-max-per-sec", "ARTICLES",  &defer_all_max_per_sec, op_integer  },
+{0,"defer-all-burst-pause", "TIME",      &defer_all_burst_pause, op_seconds  },
 
 {0,"reconnect-interval",     "PERIOD", &reconnect_delay_periods,  op_seconds },
 {0,"flush-retry-interval",   "PERIOD", &flushfail_retry_periods,  op_seconds },
diff --git a/innduct.h b/innduct.h
index c85d057386bc3c5e413ed32be0939754195f928c..c24f8ecab76c3c6deeb4083539be74a4c56bf426 100644 (file)
--- a/innduct.h
+++ b/innduct.h
@@ -151,6 +151,7 @@ extern const char *inndconffile;
 extern int max_connections, max_queue_per_conn, target_max_feedfile_size;
 extern int period_seconds, filepoll_seconds, max_queue_per_ipf;
 extern int connection_setup_timeout, inndcomm_flush_timeout;
+extern int defer_all_max_per_sec, defer_all_burst_pause;
 
 extern double nocheck_thresh;
 extern double nocheck_decay;
diff --git a/recv.c b/recv.c
index 44587bf1fb954dca661bf9e1a0569b1f5b26b26a..1fb746a15399255f4e91fbbcd67211cbdc05b8bc 100644 (file)
--- a/recv.c
+++ b/recv.c
@@ -12,6 +12,8 @@
 
 /*========== handling responses from peer ==========*/
 
+static int defer_consecutive_count;
+
 const oop_rd_style peer_rd_style= {
   OOP_RD_DELIM_STRIP, '\n',
   OOP_RD_NUL_FORBID,
@@ -193,6 +195,7 @@ void *peer_rd_ok(oop_source *lp, oop_read *oread, oop_rd_event ev,
     code_streaming= (streaming);                               \
     GET_ARTICLE(musthavesent);                                 \
     article_done(art, RC_##how);                               \
+    defer_consecutive_count = 0;                                \
     goto dealtwith;                                            \
   }while(0)
 
@@ -241,6 +244,24 @@ void *peer_rd_ok(oop_source *lp, oop_read *oread, oop_rd_event ev,
   case 436: /* IHAVE says try later */
     GET_ARTICLE(0);
     article_defer(art, RC_deferred);
+
+    // Some implementations don't reject connections when they're expiring
+    // etc., but accept them and defer every article.  If such a situation
+    // persists, we will run through the entire backlog each retry interval,
+    // getting the same answer for each article.  We try to detect this: if
+    // we seem to be getting only deferrals, we pause the whole feed.
+    defer_consecutive_count += 1;
+    if (defer_consecutive_count
+       > defer_all_max_per_sec * defer_all_burst_pause) {
+      warn("%d consecutive deferrals, pausing %ds; last response: %s",
+          defer_consecutive_count, defer_all_burst_pause, data);
+      defer_consecutive_count = 0;
+      // This causes the *whole program* including the entire event loop
+      // to freeze.  That is, surprisingly, what we want.  When we wake up
+      // hopefully things are better.  If not we will keep sleeping here,
+      // and making no progress - which is as expected.
+      sleep(defer_all_burst_pause);
+    }
     break;
 
   }