From 01fd4269fa9ffc605f1cadfa7a6a38b3ee8f3042 Mon Sep 17 00:00:00 2001 From: Benoit Chesneau Date: Mon, 25 May 2026 18:11:37 +0200 Subject: [PATCH] fix(pool): tolerate closing-connection race in checkout (#850) A checkout that races a server-side close crashed the pool GenServer: find_available calls hackney_conn:is_ready and returns {ok, connected}, then the checkout did ok = hackney_conn:set_owner(Pid, Requester). The two are separate gen_statem calls, so a tcp_closed can be processed in between, leaving set_owner to reply {error, invalid_state} during the closed grace window and failing the hard match. Checkout now handles {error, _} from set_owner and starts a fresh connection. The async checkin/prewarm path (set_owner_async) had the same race silently: a pooled connection that already closed now stops on the cast so the pool's monitor drops it instead of handing it out. Spec for set_owner/2 corrected to ok | {error, invalid_state}. --- NEWS.md | 11 +++++++++ src/hackney_conn.erl | 10 +++++++- src/hackney_pool.erl | 20 +++++++++++++--- test/hackney_conn_tests.erl | 46 ++++++++++++++++++++++++++++++++++++- 4 files changed, 82 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3abc54c8..22765ee8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,16 @@ # NEWS +4.0.2 - unreleased +------------------ + +### Bug Fixes + +- Fix an intermittent pool crash when a server closes a pooled keep-alive + connection during checkout (#850). The checkout now tolerates the + `set_owner` race and falls through to a fresh connection instead of crashing + on a bad match, and an async ownership handoff to an already-closed pooled + connection stops it promptly so the pool drops it from rotation. + 4.0.1 - 2026-05-25 ------------------ diff --git a/src/hackney_conn.erl b/src/hackney_conn.erl index 9563a40d..4a81d6d1 100644 --- a/src/hackney_conn.erl +++ b/src/hackney_conn.erl @@ -459,7 +459,7 @@ release_to_pool(Pid) -> %% This updates the process being monitored - if the new owner crashes, %% the connection will terminate. Used by the pool when checking out %% a connection to a new requester. --spec set_owner(pid(), pid()) -> ok. +-spec set_owner(pid(), pid()) -> ok | {error, invalid_state}. set_owner(Pid, NewOwner) -> gen_statem:call(Pid, {set_owner, NewOwner}, 5000). @@ -1496,6 +1496,14 @@ closed({call, From}, get_state, _Data) -> closed(info, {'DOWN', Ref, process, _Pid, _Reason}, #conn_data{owner_mon = Ref} = Data) -> {stop, normal, Data}; +closed(cast, {set_owner, _NewOwner}, #conn_data{pool_pid = PoolPid} = Data) + when is_pid(PoolPid) -> + %% #850: the pool tried to hand ownership to a pooled connection that + %% already closed (async checkin/prewarm raced a server-side close). Stop + %% now so the pool's monitor removes us from `available` promptly, instead + %% of lingering through the grace window and being handed out again. + {stop, normal, Data}; + closed(EventType, Event, Data) -> handle_common(EventType, Event, closed, Data). diff --git a/src/hackney_pool.erl b/src/hackney_pool.erl index d554a873..c1aaaf93 100644 --- a/src/hackney_pool.erl +++ b/src/hackney_pool.erl @@ -447,9 +447,23 @@ handle_call({checkout, Key, Requester, Opts}, _From, State) -> {ok, Pid, Available2} -> %% Found an available connection - update owner to new requester ?report_debug("pool: reusing connection", [{pool, PoolName}, {pid, Pid}]), - ok = hackney_conn:set_owner(Pid, Requester), - InUse2 = maps:put(Pid, Key, InUse), - {reply, {ok, Pid}, State#state{available=Available2, in_use=InUse2}}; + case hackney_conn:set_owner(Pid, Requester) of + ok -> + InUse2 = maps:put(Pid, Key, InUse), + {reply, {ok, Pid}, State#state{available=Available2, in_use=InUse2}}; + {error, _} -> + %% #850: the connection closed between is_ready and + %% set_owner (server-side close race). It is already out of + %% Available2; drop it and start a fresh connection rather + %% than crashing the pool on a bad match. + case start_connection(Key, Requester, Opts, State#state{available=Available2}) of + {ok, Pid2, State2} -> + InUse2 = maps:put(Pid2, Key, State2#state.in_use), + {reply, {ok, Pid2}, State2#state{in_use=InUse2}}; + {error, Reason} -> + {reply, {error, Reason}, State#state{available=Available2}} + end + end; none when TotalInUse >= MaxConn -> %% At max connections - return error immediately (no queue) %% Note: In the new architecture, load_regulation handles waiting diff --git a/test/hackney_conn_tests.erl b/test/hackney_conn_tests.erl index b33f29bc..be1b0505 100644 --- a/test/hackney_conn_tests.erl +++ b/test/hackney_conn_tests.erl @@ -28,7 +28,11 @@ hackney_conn_test_() -> {"pre-established socket starts connected", fun test_preestablished_socket/0}, {"connect timeout", fun test_connect_timeout/0}, {"connect to invalid host", fun test_connect_invalid/0}, - {"owner death stops connection", fun test_owner_death/0} + {"owner death stops connection", fun test_owner_death/0}, + {"set_owner on closed connection returns invalid_state (#850)", + fun test_set_owner_closed_returns_error/0}, + {"set_owner_async stops a closed pooled connection (#850)", + fun test_set_owner_async_closed_pooled_stops/0} ]}. %% Integration tests - use embedded Cowboy server @@ -232,6 +236,46 @@ test_owner_death() -> %% Connection should have stopped ?assertNot(is_process_alive(Pid)). +%% #850: when a checkout races a server-side close, the pool calls set_owner on +%% a connection that has just transitioned to `closed`. It must get +%% {error, invalid_state} back (so the pool can fall through to a fresh +%% connection) rather than crash. A non-pooled connection has no grace timer, +%% so it stays in `closed` to answer. +test_set_owner_closed_returns_error() -> + {Pid, ListenSock} = connected_conn(#{}), + ?assertEqual({ok, connected}, hackney_conn:get_state(Pid)), + ok = hackney_conn:close(Pid), + ?assertEqual({ok, closed}, hackney_conn:get_state(Pid)), + ?assertEqual({error, invalid_state}, hackney_conn:set_owner(Pid, self())), + hackney_conn:stop(Pid), + gen_tcp:close(ListenSock). + +%% #850: the async ownership handoff (pool checkin/prewarm) to a *pooled* +%% connection that already closed must stop it promptly so the pool's monitor +%% removes it from `available`, instead of lingering through the grace window. +%% Dying within ~10 ms (well under ?CLOSED_GRACE_MS = 50 ms) shows the cast +%% stopped it, not the grace timer. +test_set_owner_async_closed_pooled_stops() -> + {Pid, ListenSock} = connected_conn(#{pool_pid => self()}), + ?assertEqual({ok, connected}, hackney_conn:get_state(Pid)), + ok = hackney_conn:close(Pid), + hackney_conn:set_owner_async(Pid, self()), + timer:sleep(10), + ?assertNot(is_process_alive(Pid)), + gen_tcp:close(ListenSock). + +%% Start a hackney_conn already in `connected` state via a pre-established +%% local socket pair (no server needed). Extra opts are merged in. +connected_conn(Extra) -> + {ok, ListenSock} = gen_tcp:listen(0, [binary, {active, false}]), + {ok, Port} = inet:port(ListenSock), + {ok, ClientSock} = gen_tcp:connect("127.0.0.1", Port, [binary, {active, false}]), + {ok, _ServerSock} = gen_tcp:accept(ListenSock, 1000), + Opts = maps:merge(#{host => "127.0.0.1", port => Port, + transport => hackney_tcp, socket => ClientSock}, Extra), + {ok, Pid} = hackney_conn:start_link(Opts), + {Pid, ListenSock}. + %%==================================================================== %% Request/Response Tests %%====================================================================