From be06111617647ce0a0019bcaa9cdfc63cf148f5f Mon Sep 17 00:00:00 2001 From: David Vasandani Date: Fri, 24 Oct 2025 21:42:21 -0700 Subject: [PATCH] Add configurable restart strategies to prevent unexpected downtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add .gitignore to exclude Python cache files - Implement configurable restart strategies for ECS External instances - Update README with comprehensive documentation 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .gitignore | 40 +++ README.md | 86 ++++++- .../ecs-external-instance-network-sentry.py | 227 ++++++++++++++++-- 3 files changed, 329 insertions(+), 24 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7988ff2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual environments +venv/ +ENV/ +env/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/README.md b/README.md index 8e5468f..8ee2896 100644 --- a/README.md +++ b/README.md @@ -50,12 +50,18 @@ In reference to the diagram: - eINS updates Docker restart policy updated to `on-failure` for each ECS managed container [4]. This ensures that any ECS managed containers will be restarted if exiting due to error, the Docker daemon is restarted, or the external instance is rebooted. - When the ECS control-plane becomes reachable: - - ECS managed containers that have been automatically restarted by the Docker daemon during network outage are stopped and removed.** + - The behavior depends on the configured `--restart-strategy`: + - **`cleanup` (default)**: ECS managed containers that have been automatically restarted by the Docker daemon during network outage are stopped and removed.** ECS will relaunch these tasks. + - **`preserve`**: Restarted containers continue running. They may become orphaned by ECS but remain available. + - **`graceful-cutover`**: Agent remains paused until ECS launches replacement containers, then restarted containers are stopped. + - **`manual`**: Agent remains paused, requiring manual intervention before any changes. - ECS managed containers that have not been automatically restarted during network outage have their Docker restart policy set back to `no`. - - The local ECS agent is un-paused. + - The local ECS agent is un-paused (except for `manual` strategy when containers were restarted). > *At this point the operational environment has been restored back to the [Connected Operation](#Connected-Operation) scenario. eINS will continue to monitor for network outage or ECS control-plane error.* + > *For critical services where unexpected downtime must be avoided, see the [Configuration Parameters](#Configuration-Parameters) section for information on the `--restart-strategy` parameter.* + #### Notes *ECS agent is paused, as if left in a running state the agent will detect and kill ECS managed containers that have been restarted by the Docker daemon during the period of network outage. @@ -156,6 +162,68 @@ Specify log data event severity. $ python3 ecs-external-instance-network-sentry.py --region ap-southeast-2 --interval 15 --retries 5 --logfile /mypath/myfile.log --loglevel DEBUG ``` +##### `--restart-strategy` + +Specify the strategy for handling containers that were restarted during a network outage when connectivity is restored. This parameter is critical for deployments where unexpected container downtime must be avoided. + +- optional=yes + +- default=cleanup + +- choices=cleanup, preserve, graceful-cutover, manual + +**Available Strategies:** + +1. **cleanup** (default) - Original behavior + - Stops and removes containers that were restarted during the outage + - ECS control-plane will relaunch these tasks once the agent reconnects + - Results in brief downtime during the transition + - Best for: Non-critical services where brief downtime is acceptable + +2. **preserve** - Keep all restarted containers running + - Does NOT stop or remove containers that restarted during the outage + - Simply resets their restart policy back to "no" and unpauses the ECS agent + - Containers continue running without interruption + - May result in orphaned containers (no longer managed by ECS) + - Best for: Critical services where zero downtime is required and you can handle orphaned containers manually during maintenance + +3. **graceful-cutover** - Wait for ECS to launch replacements before stopping old containers + - Keeps ECS agent paused after connectivity is restored + - Waits for ECS control-plane to launch replacement containers + - Only stops old restarted containers after replacements are detected and running + - Provides zero-downtime cutover from restarted containers to ECS-managed replacements + - Has configurable timeout (see `--cutover-timeout`) + - Best for: Critical services that need zero downtime but want ECS to eventually take over management + +4. **manual** - Require manual intervention + - Keeps ECS agent paused when connectivity returns if any containers were restarted + - Resets restart policies but does NOT stop restarted containers + - Logs a warning requiring manual review and intervention + - Operator must manually unpause the ECS agent when ready + - Best for: High-security or highly-critical environments where operator approval is required before any changes + +```bash +# Example: Use preserve strategy for critical services +$ python3 ecs-external-instance-network-sentry.py --region ap-southeast-2 --restart-strategy preserve + +# Example: Use graceful-cutover with custom timeout +$ python3 ecs-external-instance-network-sentry.py --region ap-southeast-2 --restart-strategy graceful-cutover --cutover-timeout 600 +``` + +##### `--cutover-timeout` + +Specify the timeout in seconds for the `graceful-cutover` strategy to wait for ECS to launch replacement containers. If the timeout is reached before replacements are detected, the agent will be unpaused and a warning will be logged requiring manual intervention. + +- optional=yes + +- default=300 (5 minutes) + +- only applies when `--restart-strategy graceful-cutover` is used + +```bash +$ python3 ecs-external-instance-network-sentry.py --region ap-southeast-2 --restart-strategy graceful-cutover --cutover-timeout 600 +``` + ## Installation It's recommended that the external instance first be registered with ECS before installing the eINS. Installation instructions for eINS are provided below in the correct order of precedence. @@ -347,9 +415,19 @@ Logfile will rotate at 5Mb and a history of the five most recent logfiles will b ## Considerations -The eINS currently has the following limitation: +### Restart Strategy Selection + +The default `cleanup` restart strategy will stop and remove containers that were restarted during a network outage when connectivity is restored. This results in brief downtime as ECS relaunches the tasks. + +**For critical services where unexpected downtime must be avoided**, consider using one of the alternative restart strategies: + +- **`graceful-cutover`** (recommended for most critical services): Waits for ECS to launch replacement containers before stopping the restarted ones, providing zero-downtime transition while maintaining ECS control. + +- **`preserve`**: Keeps restarted containers running indefinitely, accepting that they may become orphaned by ECS. You'll need to manually clean up orphaned containers during maintenance windows. + +- **`manual`**: Requires operator approval before any changes are made after connectivity is restored, giving you full control over the transition. - - As described in the [Disconnected Operation](#Disconnected-Operation) section, containers that have been restarted during a period where the ECS control-plane is unavailable will be stopped once the ECS control-plane becomes available. +See the [Configuration Parameters](#Configuration-Parameters) section for detailed information on each restart strategy and the `--restart-strategy` parameter. ## Security diff --git a/python/ecs-external-instance-network-sentry.py b/python/ecs-external-instance-network-sentry.py index d7b8d98..6c7e809 100755 --- a/python/ecs-external-instance-network-sentry.py +++ b/python/ecs-external-instance-network-sentry.py @@ -28,6 +28,11 @@ help="Logfile name & location.") ap.add_argument("-k", "--loglevel", default="DEBUG", required=False, help="Log data event severity.") +ap.add_argument("-s", "--restart-strategy", default="cleanup", required=False, + choices=['cleanup', 'preserve', 'graceful-cutover', 'manual'], + help="Strategy for handling restarted containers when connectivity returns. 'cleanup' (default): stop and remove restarted containers; 'preserve': keep restarted containers running; 'graceful-cutover': wait for ECS replacements before stopping; 'manual': require manual intervention.") +ap.add_argument("-t", "--cutover-timeout", default=300, required=False, type=int, + help="Timeout in seconds for graceful-cutover strategy to wait for replacement containers (default: 300).") args = vars(ap.parse_args()) # - internal variables.. client = docker.from_env() @@ -36,6 +41,10 @@ ecs_request_data = "GET / HTTP/1.1\r\nHost: " + ecs_host + "\r\nAccept: text/html\r\n\r\n" port = 443 all_data=[] +# - state tracking for graceful-cutover.. +cutover_in_progress = False +cutover_start_time = None +restarted_containers = {} # maps container_id -> container metadata # logging: # - configure logging.. @@ -58,6 +67,8 @@ logging.info("[startup] arg - retries: " + str(args["retries"])) logging.info("[startup] arg - logfile: " + str(args["logfile"])) logging.info("[startup] arg - loglevel: logging." + str(args["loglevel"])) +logging.info("[startup] arg - restart-strategy: " + str(args["restart_strategy"])) +logging.info("[startup] arg - cutover-timeout: " + str(args["cutover_timeout"])) # main logic as infinite loop.. while True: @@ -130,33 +141,209 @@ else: logging.info("[ecs-online] ecs is reachable..") - for container in client.containers.list(): - - if container.name != "ecs-agent": - if "com.amazonaws.ecs.cluster" in container.labels: - - # update ecs managed containers: - # - stop & remove containers that have restarted.. - if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": - if container.attrs["RestartCount"] > 0: - logging.info("[ecs-online] container name: " + str(container.name)) + + # strategy: cleanup (default - original behavior) + if args["restart_strategy"] == "cleanup": + logging.info("[ecs-online] using 'cleanup' strategy - will stop and remove restarted containers..") + for container in client.containers.list(): + + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + + # update ecs managed containers: + # - stop & remove containers that have restarted.. + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + if container.attrs["RestartCount"] > 0: + logging.info("[ecs-online] container name: " + str(container.name)) + logging.info("[ecs-online] ecs cluster: " + str(container.labels["com.amazonaws.ecs.cluster"])) + logging.info("[ecs-online] container has been restarted by docker, stopping & removing..") + container.stop() + container.remove() + # - update restart policy for containers that have not restarted.. + else: + container.update(restart_policy={"Name": "no"}) + container.reload() + logging.info("[ecs-online] container name: " + str(container.name)) + logging.info("[ecs-online] ecs cluster: " + str(container.labels["com.amazonaws.ecs.cluster"])) + logging.info("[ecs-online] set container restart policy: " + str(container.attrs["HostConfig"]["RestartPolicy"])) + + # unpause the ecs agent.. + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused..") + + # strategy: preserve (keep restarted containers running) + elif args["restart_strategy"] == "preserve": + logging.info("[ecs-online] using 'preserve' strategy - keeping all restarted containers running..") + for container in client.containers.list(): + + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + + # update restart policy for all ecs managed containers back to "no" + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + container.update(restart_policy={"Name": "no"}) + container.reload() + logging.info("[ecs-online] container name: " + str(container.name)) logging.info("[ecs-online] ecs cluster: " + str(container.labels["com.amazonaws.ecs.cluster"])) - logging.info("[ecs-online] container has been restarted by docker, stopping & removing..") - container.stop() - container.remove() - # - update restart policy for containers that have not restarted.. - else: + if container.attrs["RestartCount"] > 0: + logging.info("[ecs-online] container was restarted during outage - preserving (RestartCount: " + str(container.attrs["RestartCount"]) + ")") + logging.info("[ecs-online] set container restart policy: " + str(container.attrs["HostConfig"]["RestartPolicy"])) + + # unpause the ecs agent.. + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused..") + + # strategy: manual (require manual intervention) + elif args["restart_strategy"] == "manual": + logging.info("[ecs-online] using 'manual' strategy - keeping agent paused, manual intervention required..") + restarted_found = False + for container in client.containers.list(): + + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + + # update restart policy for all ecs managed containers back to "no" + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": container.update(restart_policy={"Name": "no"}) container.reload() logging.info("[ecs-online] container name: " + str(container.name)) logging.info("[ecs-online] ecs cluster: " + str(container.labels["com.amazonaws.ecs.cluster"])) + if container.attrs["RestartCount"] > 0: + restarted_found = True + logging.info("[ecs-online] container was restarted during outage (RestartCount: " + str(container.attrs["RestartCount"]) + ")") logging.info("[ecs-online] set container restart policy: " + str(container.attrs["HostConfig"]["RestartPolicy"])) - # unpause the ecs agent.. - if container.name == "ecs-agent": - if (container.attrs["State"]["Status"]) == "paused": - container.unpause() - logging.info("[ecs-online] ecs agent unpaused..") + # DO NOT unpause the ecs agent in manual mode + + if restarted_found: + logging.warning("[ecs-online] MANUAL INTERVENTION REQUIRED: Containers were restarted during outage. ECS agent remains PAUSED. Review containers and manually unpause agent when ready.") + else: + logging.info("[ecs-online] No containers were restarted during outage. Unpausing agent..") + for container in client.containers.list(): + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused..") + + # strategy: graceful-cutover (wait for ECS to launch replacements) + elif args["restart_strategy"] == "graceful-cutover": + + # first time detecting connectivity after outage - identify restarted containers + if not cutover_in_progress: + restarted_found = False + for container in client.containers.list(): + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + if container.attrs["RestartCount"] > 0: + restarted_found = True + # track restarted containers + restarted_containers[container.id] = { + "name": container.name, + "cluster": container.labels.get("com.amazonaws.ecs.cluster", "unknown"), + "task_arn": container.labels.get("com.amazonaws.ecs.task-arn", "unknown"), + "restart_count": container.attrs["RestartCount"] + } + logging.info("[ecs-online] identified restarted container: " + str(container.name) + " (RestartCount: " + str(container.attrs["RestartCount"]) + ")") + + if restarted_found: + cutover_in_progress = True + cutover_start_time = time.time() + logging.info("[ecs-online] using 'graceful-cutover' strategy - starting cutover process..") + logging.info("[ecs-online] keeping agent paused and waiting for ECS to launch replacement containers (timeout: " + str(args["cutover_timeout"]) + "s)..") + else: + # no containers were restarted, just restore normal operation + logging.info("[ecs-online] using 'graceful-cutover' strategy - no containers were restarted, resuming normal operation..") + for container in client.containers.list(): + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + container.update(restart_policy={"Name": "no"}) + container.reload() + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused..") + + # cutover in progress - check if we should complete it + if cutover_in_progress: + elapsed_time = time.time() - cutover_start_time + logging.info("[ecs-online] cutover in progress (elapsed: " + str(int(elapsed_time)) + "s / timeout: " + str(args["cutover_timeout"]) + "s)..") + + # check if timeout has been reached + if elapsed_time >= args["cutover_timeout"]: + logging.warning("[ecs-online] cutover timeout reached! Manual intervention may be required.") + logging.warning("[ecs-online] Restarted containers are still running. Review and manually stop/remove them if needed.") + # reset restart policies and unpause agent anyway + for container in client.containers.list(): + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + container.update(restart_policy={"Name": "no"}) + container.reload() + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused (after timeout)..") + cutover_in_progress = False + restarted_containers = {} + else: + # check if ECS has launched replacement containers + # look for new containers with same task labels but different IDs + current_containers = client.containers.list() + all_restarted_ids = set(restarted_containers.keys()) + + # find containers that might be replacements + # (have ECS cluster label, not in our restarted list, and not the agent) + potential_replacements = [] + for container in current_containers: + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + if container.id not in all_restarted_ids: + # this might be a replacement - check if it's running and healthy + if container.attrs["State"]["Status"] == "running": + # check uptime - consider it stable if running for at least 30 seconds + started_at = container.attrs["State"]["StartedAt"] + # we'll consider any new running container as a potential replacement + potential_replacements.append(container) + + # if we found potential replacements, perform cutover + if len(potential_replacements) > 0: + logging.info("[ecs-online] found " + str(len(potential_replacements)) + " potential replacement container(s), performing cutover..") + + # stop and remove restarted containers + for container_id, metadata in restarted_containers.items(): + try: + container = client.containers.get(container_id) + logging.info("[ecs-online] stopping and removing restarted container: " + str(metadata["name"])) + container.stop() + container.remove() + except Exception as e: + logging.error("[ecs-online] error stopping/removing container " + str(metadata["name"]) + ": " + str(e)) + + # reset restart policies for remaining containers + for container in client.containers.list(): + if container.name != "ecs-agent": + if "com.amazonaws.ecs.cluster" in container.labels: + if (container.attrs["HostConfig"]["RestartPolicy"]["Name"]) == "on-failure": + container.update(restart_policy={"Name": "no"}) + container.reload() + + # unpause the ecs agent + for container in client.containers.list(): + if container.name == "ecs-agent": + if (container.attrs["State"]["Status"]) == "paused": + container.unpause() + logging.info("[ecs-online] ecs agent unpaused - cutover complete!") + + cutover_in_progress = False + restarted_containers = {} + else: + logging.info("[ecs-online] no replacement containers detected yet, waiting.. (agent remains paused)") logging.info("[end] sleeping for " + str(args["interval"]) + " seconds..") time.sleep(int(args["interval"]))