From 0d667dc7163b54f99fad9295da9387f47ddcb366 Mon Sep 17 00:00:00 2001 From: Sebastian Zuchmanski Date: Mon, 27 Apr 2026 10:58:55 +0200 Subject: [PATCH] Fix tend-thread busy-loop on NodeValidator silent failure A bare rescue in NodeValidator#get_hosts lets a half-initialized validator escape to Cluster#create_node, producing a Node with @host = nil that crashes every subsequent tend cycle. The tend loop's sleep on the success path of begin/rescue turned that into a CPU-pegged busy-loop at ~50 errors/sec until pid restart. - NodeValidator: post-condition raises InvalidNode if @name nil or @aliases empty; log swallowed errors at debug instead of bare rescue. - Node::Refresh::Peers: integration-point guard skips half-initialized validators before cluster.create_node (covers the @name-set, aliases-empty path the existing mismatch check cannot catch). - Cluster#launch_tend_thread: explicit begin/rescue with outer sleep prevents busy-loop on persistent tend failures. Specs cover all four NodeValidator silent-failure paths, both half- init shapes in Peers refresh, and rescue-path-sleep behavior; each fix verified by revert-and-rerun. INF-684 --- lib/aerospike/cluster.rb | 12 +- lib/aerospike/node/refresh/peers.rb | 13 ++ lib/aerospike/node_validator.rb | 13 +- spec/aerospike/cluster_spec.rb | 33 +++++ spec/aerospike/node/refresh/peers_spec.rb | 152 ++++++++++++++++++++++ spec/aerospike/node_validator_spec.rb | 48 +++++-- 6 files changed, 251 insertions(+), 20 deletions(-) create mode 100644 spec/aerospike/node/refresh/peers_spec.rb diff --git a/lib/aerospike/cluster.rb b/lib/aerospike/cluster.rb index 7340577f..c0a8497f 100644 --- a/lib/aerospike/cluster.rb +++ b/lib/aerospike/cluster.rb @@ -369,13 +369,17 @@ def launch_tend_thread @tend_thread = Thread.new do Thread.current.abort_on_exception = false loop do - + # Sleep is unconditional: the previous structure put `sleep` on the + # success path of an implicit `begin/rescue`, so a persistent error + # (e.g. a poisoned `@cluster_nodes` entry) busy-looped at full CPU + # and produced ~50 errors/sec. The rescue must not skip the backoff. + begin tend - sleep(@tend_interval / 1000.0) - rescue => e + rescue => e Aerospike.logger.error("Exception occured during tend: #{e}") Aerospike.logger.debug { e.backtrace.join("\n") } - + end + sleep(@tend_interval / 1000.0) end end end diff --git a/lib/aerospike/node/refresh/peers.rb b/lib/aerospike/node/refresh/peers.rb index 5d60b62e..60b82d67 100644 --- a/lib/aerospike/node/refresh/peers.rb +++ b/lib/aerospike/node/refresh/peers.rb @@ -43,6 +43,19 @@ def call(node, peers) begin nv = NodeValidator.new(cluster, host, cluster.connection_timeout, cluster.cluster_name, cluster.tls_options) + # Defense-in-depth at the integration point: refuse a half-initialized + # validator before it reaches `cluster.create_node`. A `nv` with empty + # aliases produces a `Node` with `@host = nil` that crashes every + # subsequent tend cycle. Must run before the mismatch check below — the + # `name == peer.node_name && aliases.empty?` case slips past it cleanly. + if nv.name.nil? || nv.aliases.empty? + ::Aerospike.logger.warn( + "Skipping peer #{peer.node_name} at #{host}: validator returned " \ + "name=#{nv.name.inspect} aliases=#{nv.aliases.size}" + ) + next + end + if nv.name != peer.node_name ::Aerospike.logger.warn("Peer node #{peer.node_name} is different than actual node #{nv.name} for host #{host}"); # Must look for new node name in the unlikely event that node names do not agree. diff --git a/lib/aerospike/node_validator.rb b/lib/aerospike/node_validator.rb index 5a5fe36b..919418e6 100644 --- a/lib/aerospike/node_validator.rb +++ b/lib/aerospike/node_validator.rb @@ -35,6 +35,15 @@ def initialize(cluster, host, timeout, cluster_name, tls_options = {}) resolve(host.name).each do |address| @aliases += get_hosts(address) end + + # Reject a half-initialized validator: `get_hosts` swallows connection + # errors silently, so without this guard a degenerate `nv` (no name, no + # aliases) escapes to `Cluster#create_node` and produces a `Node` with + # `@host = nil` that crashes on the next tend cycle. + if @name.nil? || @aliases.empty? + raise ::Aerospike::Exceptions::InvalidNode, + "Failed to validate node at #{host}: name=#{@name.inspect} aliases=#{@aliases.size}" + end end private @@ -66,8 +75,8 @@ def get_hosts(address) end res = aliases.map { |al| Host.new(al[:address], al[:port], host.tls_name) } - rescue - # we don't care about the actual connection error; Just need to continue + rescue => e + ::Aerospike.logger.debug { "NodeValidator: get_hosts(#{address}) failed: #{e.class}: #{e.message}" } ensure conn.close if conn end diff --git a/spec/aerospike/cluster_spec.rb b/spec/aerospike/cluster_spec.rb index 3ffa3909..e34bee2d 100644 --- a/spec/aerospike/cluster_spec.rb +++ b/spec/aerospike/cluster_spec.rb @@ -144,4 +144,37 @@ it { is_expected.to be false } end end + + describe '#launch_tend_thread (Fix C: rescue path must not skip sleep)' do + subject(:launch!) { instance.send(:launch_tend_thread) } + + let(:tend_calls) { ::Aerospike::Atomic.new(0) } + let(:sleep_calls) { ::Aerospike::Atomic.new(0) } + + before do + instance.instance_variable_set(:@tend_interval, 1) + allow(instance).to receive(:tend) do + tend_calls.update { |c| c + 1 } + raise StandardError, 'persistent tend failure' + end + allow(instance).to receive(:sleep) do |_| + calls = sleep_calls.update { |c| c + 1 } + Thread.exit if calls >= 3 + end + allow(::Aerospike.logger).to receive(:error) + allow(::Aerospike.logger).to receive(:debug) + end + + after do + instance.instance_variable_get(:@tend_thread)&.kill + end + + it 'sleeps once per iteration even when tend raises persistently (no busy-loop)' do + launch! + instance.instance_variable_get(:@tend_thread).join(2) + + expect(sleep_calls.value).to eq(tend_calls.value) + expect(sleep_calls.value).to be >= 3 + end + end end diff --git a/spec/aerospike/node/refresh/peers_spec.rb b/spec/aerospike/node/refresh/peers_spec.rb new file mode 100644 index 00000000..48e735b0 --- /dev/null +++ b/spec/aerospike/node/refresh/peers_spec.rb @@ -0,0 +1,152 @@ +# frozen_string_literal: true + +# Copyright 2026 Aerospike, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +RSpec.describe Aerospike::Node::Refresh::Peers do + subject(:call!) { described_class.call(node, peers) } + + let(:node) { double('node') } + let(:cluster) { double('cluster') } + let(:tend_connection) { double('tend_connection') } + let(:peers) { ::Aerospike::Peers.new } + let(:peers_generation) { ::Aerospike::Node::Generation.new } + let(:peers_count) { ::Aerospike::Atomic.new(0) } + + let(:peer_node_name) { 'BB92118BAB14E12' } + let(:peer_host) { ::Aerospike::Host.new('172.31.73.252', 3000) } + let(:peer) do + ::Aerospike::Peer.new.tap do |p| + p.node_name = peer_node_name + p.hosts = [peer_host] + end + end + + let(:fetch_collection) do + instance_double( + ::Aerospike::Peers::Parse::Object, + generation: 1, + peers: [peer] + ) + end + + before do + allow(node).to receive(:cluster).and_return(cluster) + allow(node).to receive(:tend_connection).and_return(tend_connection) + allow(node).to receive(:peers_count).and_return(peers_count) + allow(node).to receive(:peers_generation).and_return(peers_generation) + allow(node).to receive(:failures).and_return(::Aerospike::Atomic.new(0)) + allow(node).to receive(:active?).and_return(true) + allow(node).to receive(:name).and_return('node-self') + + allow(cluster).to receive(:connection_timeout).and_return(1000) + allow(cluster).to receive(:cluster_name).and_return('test') + allow(cluster).to receive(:tls_options).and_return({}) + allow(cluster).to receive(:find_node_by_name).and_return(nil) + allow(cluster).to receive(:create_node) + + allow(::Aerospike::Peers::Fetch).to receive(:call).and_return(fetch_collection) + allow(::Aerospike::Cluster::FindNode).to receive(:call).and_return(nil) + allow(::Aerospike::Node::Refresh::Failed).to receive(:call) + allow(::Aerospike.logger).to receive(:warn) + end + + context 'happy path: validator returns matching name and non-empty aliases' do + let(:new_node) { instance_double(::Aerospike::Node) } + let(:nv) do + instance_double( + ::Aerospike::NodeValidator, + name: peer_node_name, + aliases: [peer_host] + ) + end + + before do + allow(::Aerospike::NodeValidator).to receive(:new).and_return(nv) + allow(cluster).to receive(:create_node).with(nv).and_return(new_node) + call! + end + + it 'creates the node and stores it in peers under the validator name' do + expect(cluster).to have_received(:create_node).with(nv) + expect(peers.nodes[peer_node_name]).to eq(new_node) + expect(peers_generation.changed?).to be(true) + end + end + + context 'guard fires when validator name is nil' do + let(:nv) do + instance_double(::Aerospike::NodeValidator, name: nil, aliases: []) + end + + before do + allow(::Aerospike::NodeValidator).to receive(:new).and_return(nv) + call! + end + + it 'skips create_node, leaves peers.nodes empty, and warns' do + expect(cluster).not_to have_received(:create_node) + expect(peers.nodes).to be_empty + expect(::Aerospike.logger).to have_received(:warn).with(/Skipping peer.*name=nil/) + end + end + + context 'guard fires when name matches but aliases is empty (path 4)' do + let(:nv) do + instance_double(::Aerospike::NodeValidator, name: peer_node_name, aliases: []) + end + + before do + allow(::Aerospike::NodeValidator).to receive(:new).and_return(nv) + call! + end + + it 'still skips create_node — mismatch check below cannot catch this' do + expect(cluster).not_to have_received(:create_node) + expect(peers.nodes).to be_empty + expect(::Aerospike.logger).to have_received(:warn).with(/aliases=0/) + end + end + + context 'continues iterating peer.hosts after a guarded skip (next, not break)' do + let(:peer_host_b) { ::Aerospike::Host.new('172.31.73.253', 3000) } + let(:peer) do + ::Aerospike::Peer.new.tap do |p| + p.node_name = peer_node_name + p.hosts = [peer_host, peer_host_b] + end + end + + let(:bad_nv) do + instance_double(::Aerospike::NodeValidator, name: nil, aliases: []) + end + let(:good_nv) do + instance_double(::Aerospike::NodeValidator, name: peer_node_name, aliases: [peer_host_b]) + end + let(:new_node) { instance_double(::Aerospike::Node) } + + before do + allow(::Aerospike::NodeValidator).to receive(:new) do |_, host, *| + host == peer_host ? bad_nv : good_nv + end + allow(cluster).to receive(:create_node).with(good_nv).and_return(new_node) + call! + end + + it 'validates the second host and stores its node' do + expect(cluster).to have_received(:create_node).with(good_nv) + expect(peers.nodes[peer_node_name]).to eq(new_node) + end + end +end diff --git a/spec/aerospike/node_validator_spec.rb b/spec/aerospike/node_validator_spec.rb index 81a93205..801af38e 100644 --- a/spec/aerospike/node_validator_spec.rb +++ b/spec/aerospike/node_validator_spec.rb @@ -19,10 +19,9 @@ let(:hosts) { [::Aerospike::Host.new('127.0.0.1', '3000')] } before do - allow(socket).to receive(:write).and_return(nil) - allow(socket).to receive(:read).and_return(nil) allow(socket).to receive(:timeout=).and_return(nil) - expect(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) + allow(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) + allow(::Aerospike::Info).to receive(:request).and_return({ 'node' => 'test-node' }) end it { expect(validator.aliases).to eq(hosts) } @@ -35,7 +34,7 @@ allow(socket).to receive(:timeout=).and_return(nil) allow(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) - expect(::Aerospike::Info).to receive(:request).and_return( + allow(::Aerospike::Info).to receive(:request).and_return( { 'node' => 'test-node', 'service-clear-std' => '192.168.1.1:3000' @@ -53,15 +52,13 @@ before do allow(socket).to receive(:timeout=).and_return(nil) allow(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) - expect(Resolv).to receive(:getaddresses).and_return(['101.1.1.1', '102.1.1.1']) + allow(Resolv).to receive(:getaddresses).and_return(['101.1.1.1', '102.1.1.1']) - expect(::Aerospike::Info).to receive(:request).and_return( + allow(::Aerospike::Info).to receive(:request).and_return( { 'node' => 'test-node', 'service-clear-std' => '101.1.1.2:3002,101.1.1.3:3003' - } - ) - expect(::Aerospike::Info).to receive(:request).and_return( + }, { 'node' => 'test-node', 'service-clear-std' => '102.1.1.2:3002,102.1.1.3:3003' @@ -84,15 +81,13 @@ before do allow(socket).to receive(:timeout=).and_return(nil) allow(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) - expect(Resolv).to receive(:getaddresses).and_return(['101.1.1.1', '102.1.1.1']) + allow(Resolv).to receive(:getaddresses).and_return(['101.1.1.1', '102.1.1.1']) - expect(::Aerospike::Info).to receive(:request).and_return( + allow(::Aerospike::Info).to receive(:request).and_return( { 'node' => 'test-node', 'service-tls-std' => '101.1.1.2:3002,101.1.1.3:3003' - } - ) - expect(::Aerospike::Info).to receive(:request).and_return( + }, { 'node' => 'test-node', 'service-tls-std' => '102.1.1.2:3002,102.1.1.3:3003' @@ -107,4 +102,29 @@ ) end end + + describe 'post-condition: refuses half-initialized validators' do + let(:hosts) { [::Aerospike::Host.new('192.168.1.1', '3000')] } + + before do + allow(socket).to receive(:timeout=).and_return(nil) + allow(::Aerospike::Cluster::CreateConnection).to receive(:call).and_return(socket) + end + + context 'when get_hosts fails before assigning @name' do + before { allow(::Aerospike::Info).to receive(:request).and_raise(Errno::ETIMEDOUT) } + + it 'raises InvalidNode' do + expect { validator }.to raise_error(::Aerospike::Exceptions::InvalidNode, /name=nil aliases=0/) + end + end + + context 'when @name is set but aliases stay empty (path 4)' do + before { allow(::Aerospike::Info).to receive(:request).and_return({ 'node' => 'test-node' }) } + + it 'raises InvalidNode even though @name was assigned' do + expect { validator }.to raise_error(::Aerospike::Exceptions::InvalidNode, /aliases=0/) + end + end + end end