From 01fd4269fa9ffc605f1cadfa7a6a38b3ee8f3042 Mon Sep 17 00:00:00 2001
From: Benoit Chesneau <bchesneau@gmail.com>
Date: Mon, 25 May 2026 18:11:37 +0200
Subject: [PATCH] fix(pool): tolerate closing-connection race in checkout
 (#850)

A checkout that races a server-side close crashed the pool GenServer:
find_available calls hackney_conn:is_ready and returns {ok, connected},
then the checkout did ok = hackney_conn:set_owner(Pid, Requester). The
two are separate gen_statem calls, so a tcp_closed can be processed in
between, leaving set_owner to reply {error, invalid_state} during the
closed grace window and failing the hard match.

Checkout now handles {error, _} from set_owner and starts a fresh
connection. The async checkin/prewarm path (set_owner_async) had the
same race silently: a pooled connection that already closed now stops on
the cast so the pool's monitor drops it instead of handing it out. Spec
for set_owner/2 corrected to ok | {error, invalid_state}.
---
 NEWS.md                     | 11 +++++++++
 src/hackney_conn.erl        | 10 +++++++-
 src/hackney_pool.erl        | 20 +++++++++++++---
 test/hackney_conn_tests.erl | 46 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 3abc54c8..22765ee8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,16 @@
 # NEWS
 
+4.0.2 - unreleased
+------------------
+
+### Bug Fixes
+
+- Fix an intermittent pool crash when a server closes a pooled keep-alive
+  connection during checkout (#850). The checkout now tolerates the
+  `set_owner` race and falls through to a fresh connection instead of crashing
+  on a bad match, and an async ownership handoff to an already-closed pooled
+  connection stops it promptly so the pool drops it from rotation.
+
 4.0.1 - 2026-05-25
 ------------------
 
diff --git a/src/hackney_conn.erl b/src/hackney_conn.erl
index 9563a40d..4a81d6d1 100644
--- a/src/hackney_conn.erl
+++ b/src/hackney_conn.erl
@@ -459,7 +459,7 @@ release_to_pool(Pid) ->
 %% This updates the process being monitored - if the new owner crashes,
 %% the connection will terminate. Used by the pool when checking out
 %% a connection to a new requester.
--spec set_owner(pid(), pid()) -> ok.
+-spec set_owner(pid(), pid()) -> ok | {error, invalid_state}.
 set_owner(Pid, NewOwner) ->
     gen_statem:call(Pid, {set_owner, NewOwner}, 5000).
 
@@ -1496,6 +1496,14 @@ closed({call, From}, get_state, _Data) ->
 closed(info, {'DOWN', Ref, process, _Pid, _Reason}, #conn_data{owner_mon = Ref} = Data) ->
     {stop, normal, Data};
 
+closed(cast, {set_owner, _NewOwner}, #conn_data{pool_pid = PoolPid} = Data)
+  when is_pid(PoolPid) ->
+    %% #850: the pool tried to hand ownership to a pooled connection that
+    %% already closed (async checkin/prewarm raced a server-side close). Stop
+    %% now so the pool's monitor removes us from `available` promptly, instead
+    %% of lingering through the grace window and being handed out again.
+    {stop, normal, Data};
+
 closed(EventType, Event, Data) ->
     handle_common(EventType, Event, closed, Data).
 
diff --git a/src/hackney_pool.erl b/src/hackney_pool.erl
index d554a873..c1aaaf93 100644
--- a/src/hackney_pool.erl
+++ b/src/hackney_pool.erl
@@ -447,9 +447,23 @@ handle_call({checkout, Key, Requester, Opts}, _From, State) ->
         {ok, Pid, Available2} ->
             %% Found an available connection - update owner to new requester
             ?report_debug("pool: reusing connection", [{pool, PoolName}, {pid, Pid}]),
-            ok = hackney_conn:set_owner(Pid, Requester),
-            InUse2 = maps:put(Pid, Key, InUse),
-            {reply, {ok, Pid}, State#state{available=Available2, in_use=InUse2}};
+            case hackney_conn:set_owner(Pid, Requester) of
+                ok ->
+                    InUse2 = maps:put(Pid, Key, InUse),
+                    {reply, {ok, Pid}, State#state{available=Available2, in_use=InUse2}};
+                {error, _} ->
+                    %% #850: the connection closed between is_ready and
+                    %% set_owner (server-side close race). It is already out of
+                    %% Available2; drop it and start a fresh connection rather
+                    %% than crashing the pool on a bad match.
+                    case start_connection(Key, Requester, Opts, State#state{available=Available2}) of
+                        {ok, Pid2, State2} ->
+                            InUse2 = maps:put(Pid2, Key, State2#state.in_use),
+                            {reply, {ok, Pid2}, State2#state{in_use=InUse2}};
+                        {error, Reason} ->
+                            {reply, {error, Reason}, State#state{available=Available2}}
+                    end
+            end;
         none when TotalInUse >= MaxConn ->
             %% At max connections - return error immediately (no queue)
             %% Note: In the new architecture, load_regulation handles waiting
diff --git a/test/hackney_conn_tests.erl b/test/hackney_conn_tests.erl
index b33f29bc..be1b0505 100644
--- a/test/hackney_conn_tests.erl
+++ b/test/hackney_conn_tests.erl
@@ -28,7 +28,11 @@ hackney_conn_test_() ->
       {"pre-established socket starts connected", fun test_preestablished_socket/0},
       {"connect timeout", fun test_connect_timeout/0},
       {"connect to invalid host", fun test_connect_invalid/0},
-      {"owner death stops connection", fun test_owner_death/0}
+      {"owner death stops connection", fun test_owner_death/0},
+      {"set_owner on closed connection returns invalid_state (#850)",
+       fun test_set_owner_closed_returns_error/0},
+      {"set_owner_async stops a closed pooled connection (#850)",
+       fun test_set_owner_async_closed_pooled_stops/0}
      ]}.
 
 %% Integration tests - use embedded Cowboy server
@@ -232,6 +236,46 @@ test_owner_death() ->
     %% Connection should have stopped
     ?assertNot(is_process_alive(Pid)).
 
+%% #850: when a checkout races a server-side close, the pool calls set_owner on
+%% a connection that has just transitioned to `closed`. It must get
+%% {error, invalid_state} back (so the pool can fall through to a fresh
+%% connection) rather than crash. A non-pooled connection has no grace timer,
+%% so it stays in `closed` to answer.
+test_set_owner_closed_returns_error() ->
+    {Pid, ListenSock} = connected_conn(#{}),
+    ?assertEqual({ok, connected}, hackney_conn:get_state(Pid)),
+    ok = hackney_conn:close(Pid),
+    ?assertEqual({ok, closed}, hackney_conn:get_state(Pid)),
+    ?assertEqual({error, invalid_state}, hackney_conn:set_owner(Pid, self())),
+    hackney_conn:stop(Pid),
+    gen_tcp:close(ListenSock).
+
+%% #850: the async ownership handoff (pool checkin/prewarm) to a *pooled*
+%% connection that already closed must stop it promptly so the pool's monitor
+%% removes it from `available`, instead of lingering through the grace window.
+%% Dying within ~10 ms (well under ?CLOSED_GRACE_MS = 50 ms) shows the cast
+%% stopped it, not the grace timer.
+test_set_owner_async_closed_pooled_stops() ->
+    {Pid, ListenSock} = connected_conn(#{pool_pid => self()}),
+    ?assertEqual({ok, connected}, hackney_conn:get_state(Pid)),
+    ok = hackney_conn:close(Pid),
+    hackney_conn:set_owner_async(Pid, self()),
+    timer:sleep(10),
+    ?assertNot(is_process_alive(Pid)),
+    gen_tcp:close(ListenSock).
+
+%% Start a hackney_conn already in `connected` state via a pre-established
+%% local socket pair (no server needed). Extra opts are merged in.
+connected_conn(Extra) ->
+    {ok, ListenSock} = gen_tcp:listen(0, [binary, {active, false}]),
+    {ok, Port} = inet:port(ListenSock),
+    {ok, ClientSock} = gen_tcp:connect("127.0.0.1", Port, [binary, {active, false}]),
+    {ok, _ServerSock} = gen_tcp:accept(ListenSock, 1000),
+    Opts = maps:merge(#{host => "127.0.0.1", port => Port,
+                        transport => hackney_tcp, socket => ClientSock}, Extra),
+    {ok, Pid} = hackney_conn:start_link(Opts),
+    {Pid, ListenSock}.
+
 %%====================================================================
 %% Request/Response Tests
 %%====================================================================