From 00a781fb098e0dd81f55dc56965c09573366d995 Mon Sep 17 00:00:00 2001
From: Alan King <alanking@renci.org>
Date: Mon, 24 Jun 2024 16:54:01 -0400
Subject: [PATCH] [#7725] test_irodsctl: Adjust assertions for delay server
 leader

If the delay server leader is the local server running the test, there
should be at least 3 processes reported by the library function called
capture_process_tree. If the delay server leader is NOT the local server
running the test, there should be at least 2 processes reported. When
the local server is not the delay server leader, it is unlikely that
more than 2 processes are present, so the previous assertion of at least
3 processes could occasionally fail for reasons unrelated to the issue
being tested.

The assertions for the test called
test_agents_kept_alive_by_connected_clients_are_cleaned_up_after_failed_graceful_shutdown__issue_7619
have been adjusted to account for these cases, and extensive comments
have been added to the code to explain why the expected process count
depends on whether the local server is the delay server leader.
---
 scripts/irods/test/test_irodsctl.py | 32 ++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/scripts/irods/test/test_irodsctl.py b/scripts/irods/test/test_irodsctl.py
index 06d3ccaca1..861043f60c 100644
--- a/scripts/irods/test/test_irodsctl.py
+++ b/scripts/irods/test/test_irodsctl.py
@@ -64,14 +64,40 @@ def test_agents_kept_alive_by_connected_clients_are_cleaned_up_after_failed_grac
                 # an agent exists for the istream call.
                 lib.delayAssert(lambda: lib.replica_exists(user, logical_path, 0))
 
-                # Collect all the server processes. Should be 3-4 depending on whether the delay server is querying.
-                # capture_process_tree does not count the grandpa process.
+                # Collect all the server processes. capture_process_tree does not count the grandpa process.
+                # The expected number of descendant processes can vary due to timing, unfortunately. Here are the cases:
+                #
+                #  1. When there is no delay server and the main server is not running a query to see whether it should
+                #     spawn a delay server, there will be 2 processes: The agent factory and the agent servicing istream
+                #  2. When there is no delay server and the main server is running a query to see whether it should
+                #     spawn a delay server, there will be 3 processes: The agent factory, the agent servicing istream,
+                #     and the agent servicing the query from the main server
+                #  3. When there is a delay server and the main server is not running a query to see whether it should
+                #     tear down the delay server, and the delay server is not running a query for work to do, there
+                #     will be 3 processes: The agent factory, the delay server, and the agent servicing istream
+                #  4. When there is a delay server and the main server is running a query to see whether it should tear
+                #     down the delay server, OR the delay server is running a query for work to do, there will be 4
+                #     processes: The agent factory, the delay server, the agent servicing istream, and the agent
+                #     servicing the query from either the delay server or the main server
+                #  5. When there is a delay server and the main server is running a query to see whether it should tear
+                #     down the delay server, AND the delay server is running a query for work to do, there will be 5
+                #     processes: The agent factory, the delay server, the agent servicing istream, the agent servicing
+                #     the query from the delay server, and the agent servicing the query from the main server
+                #
+                # Because the queries could be running at any time, we check for a minimum number of server processes
+                # based on whether the "leader" for the delay server is the host running the test. There should be at
+                # least 2 processes for the case where the local server is not running the delay server (cases 1 - 2)
+                # and at least 3 processes in the case where the local server is running the delay server (cases 3 - 5).
+                delay_server_info = json.loads(
+                    admin_session.run_icommand(['iadmin', 'get_delay_server_info'])[0].strip())
+                minimum_expected_process_count = 2 if delay_server_info['leader'] != lib.get_hostname() else 3
+
                 server_descendants_before_shutdown = set()
                 self.assertTrue(
                     capture_process_tree(
                         server_proc, server_descendants_before_shutdown, IrodsController().server_binaries))
                 print(server_descendants_before_shutdown) # debugging
-                self.assertGreaterEqual(len(server_descendants_before_shutdown), 3)
+                self.assertGreaterEqual(len(server_descendants_before_shutdown), minimum_expected_process_count)
 
                 # Shut down the server - this will take a while as it will need to time out first.
                 assert_command(