From 0aee1658b80718b48d797ac93fc5ff716c08e5ec Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Tue, 14 Apr 2026 18:24:22 +0800 Subject: [PATCH 1/9] [runtime] Add Fluss backend support for action state store --- .../api/configuration/AgentConfigOptions.java | 38 ++ docs/content/docs/operations/configuration.md | 23 +- docs/content/docs/operations/deployment.md | 2 +- pom.xml | 1 + runtime/pom.xml | 52 +++ .../runtime/actionstate/ActionStateStore.java | 3 +- .../actionstate/FlussActionStateStore.java | 393 +++++++++++++++++ .../operator/ActionExecutionOperator.java | 13 +- .../actionstate/FlussActionStateStoreIT.java | 411 ++++++++++++++++++ 9 files changed, 930 insertions(+), 6 deletions(-) create mode 100644 runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java create mode 100644 runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java diff --git a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java index 724247456..03858c7c2 100644 --- a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java +++ b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java @@ -51,6 +51,44 @@ public class AgentConfigOptions { public static final ConfigOption KAFKA_ACTION_STATE_TOPIC_REPLICATION_FACTOR = new ConfigOption<>("kafkaActionStateTopicReplicationFactor", Integer.class, 1); + /** The config parameter specifies the Fluss bootstrap servers. */ + public static final ConfigOption FLUSS_BOOTSTRAP_SERVERS = + new ConfigOption<>("flussBootstrapServers", String.class, "localhost:9123"); + + /** The config parameter specifies the Fluss database for action state. */ + public static final ConfigOption FLUSS_ACTION_STATE_DATABASE = + new ConfigOption<>("flussActionStateDatabase", String.class, "flink_agents"); + + /** The config parameter specifies the Fluss table name for action state. */ + public static final ConfigOption FLUSS_ACTION_STATE_TABLE = + new ConfigOption<>("flussActionStateTable", String.class, "action_state"); + + /** The config parameter specifies the number of buckets for the Fluss action state table. */ + public static final ConfigOption FLUSS_ACTION_STATE_TABLE_BUCKETS = + new ConfigOption<>("flussActionStateTableBuckets", Integer.class, 8); + + /** The config parameter specifies the authentication protocol for Fluss client. */ + public static final ConfigOption FLUSS_SECURITY_PROTOCOL = + new ConfigOption<>("flussSecurityProtocol", String.class, "PLAINTEXT"); + + /** The config parameter specifies the SASL mechanism for Fluss authentication. */ + public static final ConfigOption FLUSS_SASL_MECHANISM = + new ConfigOption<>("flussSaslMechanism", String.class, "PLAIN"); + + /** + * The config parameter specifies the JAAS configuration string for Fluss SASL authentication. + */ + public static final ConfigOption FLUSS_SASL_JAAS_CONFIG = + new ConfigOption<>("flussSaslJaasConfig", String.class, null); + + /** The config parameter specifies the username for Fluss SASL authentication. */ + public static final ConfigOption FLUSS_SASL_USERNAME = + new ConfigOption<>("flussSaslUsername", String.class, null); + + /** The config parameter specifies the password for Fluss SASL authentication. 
*/ + public static final ConfigOption FLUSS_SASL_PASSWORD = + new ConfigOption<>("flussSaslPassword", String.class, null); + /** The config parameter specifies the unique identifier of job. */ public static final ConfigOption JOB_IDENTIFIER = new ConfigOption<>("job-identifier", String.class, null); diff --git a/docs/content/docs/operations/configuration.md b/docs/content/docs/operations/configuration.md index 67fed224a..e55078c6a 100644 --- a/docs/content/docs/operations/configuration.md +++ b/docs/content/docs/operations/configuration.md @@ -140,14 +140,35 @@ Here is the list of all built-in core configuration options. ### Action State Store +#### Common + +| Key | Default | Type | Description | +|------------------------------|------------------|---------|------------------------------------------------------------------------------------------| +| `actionStateStoreBackend` | (none) | String | The backend for action state store. Supported values: `"kafka"`, `"fluss"`. | + #### Kafka-based Action State Store Here are the configuration options for Kafka-based Action State Store. | Key | Default | Type | Description | |-------------------------------------|--------------------------|---------|-----------------------------------------------------------------------------| -| `actionStateStoreBackend` | (none) | String | The config parameter specifies the backend for action state store. | | `kafkaBootstrapServers` | "localhost:9092" | String | The config parameter specifies the Kafka bootstrap server. | | `kafkaActionStateTopic` | (none) | String | The config parameter specifies the Kafka topic for action state. | | `kafkaActionStateTopicNumPartitions`| 64 | Integer | The config parameter specifies the number of partitions for the Kafka action state topic. | | `kafkaActionStateTopicReplicationFactor` | 1 | Integer | The config parameter specifies the replication factor for the Kafka action state topic. | + +#### Fluss-based Action State Store + +Here are the configuration options for Fluss-based Action State Store. Fluss provides O(1) KV point lookup for action state, eliminating the need for in-memory state rebuilding during recovery. + +| Key | Default | Type | Description | +|------------------------------|------------------|---------|------------------------------------------------------------------------------------------| +| `flussBootstrapServers` | "localhost:9123" | String | The Fluss bootstrap servers address. | +| `flussActionStateDatabase` | "flink_agents" | String | The Fluss database name for storing action state. | +| `flussActionStateTable` | "action_state" | String | The Fluss table name for storing action state. | +| `flussActionStateTableBuckets` | 8 | Integer | The number of buckets for the Fluss action state table. | +| `flussSecurityProtocol` | "PLAINTEXT" | String | The authentication protocol for Fluss client (e.g., `PLAINTEXT`, `SASL_PLAIN`). | +| `flussSaslMechanism` | "PLAIN" | String | The SASL mechanism for Fluss authentication. | +| `flussSaslJaasConfig` | (none) | String | The JAAS configuration string for Fluss SASL authentication. | +| `flussSaslUsername` | (none) | String | The username for Fluss SASL authentication. | +| `flussSaslPassword` | (none) | String | The password for Fluss SASL authentication. 
| diff --git a/docs/content/docs/operations/deployment.md b/docs/content/docs/operations/deployment.md index bfe609c2a..ca2efc232 100644 --- a/docs/content/docs/operations/deployment.md +++ b/docs/content/docs/operations/deployment.md @@ -172,7 +172,7 @@ After recovery from a checkpoint, Flink Agents reprocess events that arrived aft To ensure exactly-once action consistency, you must configure an external action state store. Flink Agents record action state in this store on a per-action basis. After recovering from a checkpoint, Flink Agents consult the external store and will not re-execute actions that were already completed. This guarantees each action is executed exactly once after recovering from a checkpoint. {{< hint info >}} -**Note**: Currently, Kafka is supported as the external action state store. +**Note**: Currently, Kafka and Fluss are supported as the external action state store. {{< /hint >}} See [Action State Store Configuration]({{< ref "docs/operations/configuration#action-state-store" >}}) for configuration options. diff --git a/pom.xml b/pom.xml index cd7744797..416a41f81 100644 --- a/pom.xml +++ b/pom.xml @@ -43,6 +43,7 @@ under the License. false 2.2.0 4.0.0 + 0.9.0-incubating 5.10.1 2.18.2 0.5.5 diff --git a/runtime/pom.xml b/runtime/pom.xml index 384eeea82..c2ee09449 100644 --- a/runtime/pom.xml +++ b/runtime/pom.xml @@ -116,6 +116,58 @@ under the License. kafka-clients ${kafka.version} + + + org.apache.fluss + fluss-client + ${fluss.version} + + + + org.apache.fluss + fluss-server + ${fluss.version} + test + + + org.apache.fluss + fluss-server + ${fluss.version} + test-jar + test + + + org.apache.fluss + fluss-common + ${fluss.version} + test-jar + test + + + org.apache.fluss + fluss-rpc + ${fluss.version} + test-jar + test + + + org.apache.fluss + fluss-test-utils + ${fluss.version} + test + + + org.apache.curator + curator-test + 5.4.0 + test + + + org.junit.jupiter + junit-jupiter-api + + + org.slf4j diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateStore.java index 433e2beac..e29557c0d 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateStore.java @@ -26,7 +26,8 @@ /** Interface for storing and retrieving the state of actions performed by agents. */ public interface ActionStateStore extends AutoCloseable { enum BackendType { - KAFKA("kafka"); + KAFKA("kafka"), + FLUSS("fluss"); private final String type; diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java new file mode 100644 index 000000000..8408e0c6f --- /dev/null +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -0,0 +1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.agents.runtime.actionstate; + +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.flink.agents.api.Event; +import org.apache.flink.agents.api.InputEvent; +import org.apache.flink.agents.api.OutputEvent; +import org.apache.flink.agents.plan.AgentConfiguration; +import org.apache.flink.agents.plan.actions.Action; +import org.apache.fluss.client.Connection; +import org.apache.fluss.client.ConnectionFactory; +import org.apache.fluss.client.admin.Admin; +import org.apache.fluss.client.lookup.LookupResult; +import org.apache.fluss.client.lookup.Lookuper; +import org.apache.fluss.client.table.Table; +import org.apache.fluss.client.table.writer.UpsertWriter; +import org.apache.fluss.config.Configuration; +import org.apache.fluss.metadata.DatabaseDescriptor; +import org.apache.fluss.metadata.Schema; +import org.apache.fluss.metadata.TableDescriptor; +import org.apache.fluss.metadata.TablePath; +import org.apache.fluss.row.BinaryString; +import org.apache.fluss.row.GenericRow; +import org.apache.fluss.row.InternalRow; +import org.apache.fluss.types.DataTypes; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_DATABASE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE_BUCKETS; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_BOOTSTRAP_SERVERS; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_JAAS_CONFIG; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_MECHANISM; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_PASSWORD; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_USERNAME; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SECURITY_PROTOCOL; +import static org.apache.flink.agents.runtime.actionstate.ActionStateUtil.generateKey; + +/** + * An implementation of {@link ActionStateStore} that uses Apache Fluss as the backend storage for + * action states. Fluss provides a primary key table with KV point lookup capability, eliminating + * the need for in-memory state rebuilding during recovery. + * + *
+ * <p>Compared to the Kafka-based implementation:
+ *
+ * <ul>
+ *   <li>Recovery: No sequential consumption needed; direct KV point lookup (O(1)).
+ *   <li>Memory: No in-memory cache required; on-demand queries only.
+ *   <li>Write: In-place upsert instead of append-only log.
+ * </ul>
+ *
+ * <p>The Fluss table schema:
+ *
+ * <pre>
+ * state_key   STRING   -- Primary key, generated by ActionStateUtil.generateKey()
+ * state_payload BYTES  -- ActionState JSON serialized bytes
+ * seq_num     BIGINT   -- Sequence number for pruneState()
+ * agent_key   STRING   -- Original key for prefix-based cleanup
+ * completed   BOOLEAN  -- Whether action completed
+ * updated_at  BIGINT   -- Last update timestamp
+ * </pre>
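+ *
+ * <p>Minimal usage sketch (illustrative only; the option keys are the ones defined in
+ * {@code AgentConfigOptions}, and the values are placeholders):
+ *
+ * <pre>{@code
+ * AgentConfiguration conf = new AgentConfiguration();
+ * conf.set(AgentConfigOptions.ACTION_STATE_STORE_BACKEND, "fluss");
+ * conf.set(AgentConfigOptions.FLUSS_BOOTSTRAP_SERVERS, "localhost:9123");
+ * try (ActionStateStore store = new FlussActionStateStore(conf)) {
+ *     store.put(key, seqNum, action, event, state);
+ *     ActionState restored = store.get(key, seqNum, action, event);
+ * }
+ * }</pre>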
+ */ +public class FlussActionStateStore implements ActionStateStore { + + private static final Logger LOG = LoggerFactory.getLogger(FlussActionStateStore.class); + private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); + + /** Creates and configures the ObjectMapper for ActionState serialization. */ + private static ObjectMapper createObjectMapper() { + ObjectMapper mapper = new ObjectMapper(); + // Add type information for polymorphic Event deserialization + mapper.addMixIn(Event.class, EventTypeInfoMixin.class); + mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); + mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); + return mapper; + } + + /** Mixin to add type information for Event hierarchy. */ + @JsonTypeInfo( + use = JsonTypeInfo.Id.CLASS, + include = JsonTypeInfo.As.PROPERTY, + property = "@class") + abstract static class EventTypeInfoMixin {} + + // Column indices in the Fluss table schema + private static final int COL_STATE_KEY = 0; + private static final int COL_STATE_PAYLOAD = 1; + private static final int COL_SEQ_NUM = 2; + private static final int COL_AGENT_KEY = 3; + private static final int COL_COMPLETED = 4; + private static final int COL_UPDATED_AT = 5; + private static final int NUM_COLUMNS = 6; + + private final AgentConfiguration agentConfiguration; + private final String databaseName; + private final String tableName; + private final TablePath tablePath; + + private final Connection connection; + private final Table table; + private final UpsertWriter writer; + private final Lookuper lookuper; + + /** + * Tracks the mapping from agent key to its associated state keys. This is needed because + * pruneState() needs to delete by agent key prefix, and Fluss primary key lookup requires exact + * key match. 
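+     * After a restart this map starts empty; {@code get} re-populates it on read so that
+     * {@code pruneState} can still locate and delete records written before the restart.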
+ */ + private final Map> agentKeyToStateKeys; + + public FlussActionStateStore(AgentConfiguration agentConfiguration) { + this.agentConfiguration = agentConfiguration; + this.databaseName = agentConfiguration.get(FLUSS_ACTION_STATE_DATABASE); + this.tableName = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE); + this.tablePath = TablePath.of(databaseName, tableName); + this.agentKeyToStateKeys = new HashMap<>(); + + // Create Fluss connection + Configuration flussConf = new Configuration(); + flussConf.setString("bootstrap.servers", agentConfiguration.get(FLUSS_BOOTSTRAP_SERVERS)); + + // Security / authentication config + String securityProtocol = agentConfiguration.get(FLUSS_SECURITY_PROTOCOL); + if (securityProtocol != null) { + flussConf.setString("client.security.protocol", securityProtocol); + } + String saslMechanism = agentConfiguration.get(FLUSS_SASL_MECHANISM); + if (saslMechanism != null) { + flussConf.setString("client.security.sasl.mechanism", saslMechanism); + } + String jaasConfig = agentConfiguration.get(FLUSS_SASL_JAAS_CONFIG); + if (jaasConfig != null) { + flussConf.setString("client.security.sasl.jaas.config", jaasConfig); + } + String username = agentConfiguration.get(FLUSS_SASL_USERNAME); + if (username != null) { + flussConf.setString("client.security.sasl.username", username); + } + String password = agentConfiguration.get(FLUSS_SASL_PASSWORD); + if (password != null) { + flussConf.setString("client.security.sasl.password", password); + } + + this.connection = ConnectionFactory.createConnection(flussConf); + + // Ensure database and table exist + maybeCreateDatabaseAndTable(); + + // Get table client and create writer/lookuper + this.table = connection.getTable(tablePath); + this.writer = table.newUpsert().createWriter(); + this.lookuper = table.newLookup().createLookuper(); + + LOG.info("Initialized FlussActionStateStore with table: {}.{}", databaseName, tableName); + } + + @Override + public void put(Object key, long seqNum, Action action, Event event, ActionState state) + throws Exception { + String stateKey = generateKey(key, seqNum, action, event); + byte[] payload = OBJECT_MAPPER.writeValueAsBytes(state); + + GenericRow row = new GenericRow(NUM_COLUMNS); + row.setField(COL_STATE_KEY, BinaryString.fromString(stateKey)); + row.setField(COL_STATE_PAYLOAD, payload); + row.setField(COL_SEQ_NUM, seqNum); + row.setField(COL_AGENT_KEY, BinaryString.fromString(key.toString())); + row.setField(COL_COMPLETED, state.isCompleted()); + row.setField(COL_UPDATED_AT, System.currentTimeMillis()); + + // Synchronous write to ensure durability before returning. + // This is critical for the Reconcile pattern where PENDING slots must be + // persisted before call() executes. 
+ writer.upsert(row).get(); + + // Track the agent key -> state key mapping for pruneState() + agentKeyToStateKeys + .computeIfAbsent(key.toString(), k -> new HashMap<>()) + .put(stateKey, seqNum); + + LOG.debug( + "Stored action state to Fluss: key={}, isCompleted={}", + stateKey, + state.isCompleted()); + } + + @Override + public ActionState get(Object key, long seqNum, Action action, Event event) throws Exception { + String stateKey = generateKey(key, seqNum, action, event); + + LOG.debug("Looking up action state from Fluss: key={}, seqNum={}", key, seqNum); + + GenericRow lookupKey = new GenericRow(1); + lookupKey.setField(0, BinaryString.fromString(stateKey)); + + LookupResult result = lookuper.lookup(lookupKey).get(); + InternalRow row = result.getSingletonRow(); + + if (row == null) { + LOG.debug("Action state not found in Fluss: stateKey={}", stateKey); + return null; + } + + byte[] payload = row.getBytes(COL_STATE_PAYLOAD); + ActionState state = OBJECT_MAPPER.readValue(payload, ActionState.class); + + // Rebuild the agentKeyToStateKeys mapping on read. + // This is critical after recovery: since rebuildState() is a no-op, the mapping + // is empty. Without this, pruneState() would silently skip cleanup for any state + // that was written before the restart, causing unbounded table growth. + agentKeyToStateKeys + .computeIfAbsent(key.toString(), k -> new HashMap<>()) + .put(stateKey, seqNum); + + LOG.debug( + "Found action state in Fluss: key={}, isCompleted={}", + stateKey, + state.isCompleted()); + return state; + } + + /** + * No-op for Fluss backend. Unlike the Kafka implementation which must consume from offsets to + * rebuild an in-memory HashMap, Fluss's KV table is inherently persistent. Each {@link #get} + * call performs a direct point lookup, so no in-memory state needs to be rebuilt. + */ + @Override + public void rebuildState(List recoveryMarkers) { + LOG.info( + "Fluss backend: rebuildState is a no-op (KV table is persistent). " + + "Recovery markers: {}", + recoveryMarkers.size()); + } + + @Override + public void pruneState(Object key, long seqNum) { + LOG.debug("Pruning state for key: {} up to sequence number: {}", key, seqNum); + + Map stateKeys = agentKeyToStateKeys.get(key.toString()); + if (stateKeys == null) { + return; + } + + stateKeys + .entrySet() + .removeIf( + entry -> { + if (entry.getValue() <= seqNum) { + try { + GenericRow deleteRow = new GenericRow(NUM_COLUMNS); + deleteRow.setField( + COL_STATE_KEY, BinaryString.fromString(entry.getKey())); + writer.delete(deleteRow).get(); + LOG.debug("Pruned state key: {}", entry.getKey()); + } catch (Exception e) { + LOG.warn( + "Failed to delete state key during pruning: {}", + entry.getKey(), + e); + } + return true; + } + return false; + }); + + // Remove agent key entry if no more state keys + if (stateKeys.isEmpty()) { + agentKeyToStateKeys.remove(key.toString()); + } + + // Flush to ensure deletes are visible to the lookuper + try { + writer.flush(); + } catch (Exception e) { + LOG.warn("Failed to flush writer after pruning", e); + } + + LOG.debug("Pruned state for key: {} up to sequence number: {}", key, seqNum); + } + + /** + * Returns null because the Fluss KV table is persistent and does not need recovery markers. + * Unlike Kafka which uses partition end offsets as markers, Fluss's point lookup works directly + * without any positional information. 
+ */ + @Override + public Object getRecoveryMarker() { + return null; + } + + @Override + public void close() throws Exception { + Exception firstException = null; + + try { + if (writer != null) { + writer.flush(); + } + } catch (Exception e) { + firstException = e; + } + + try { + if (table != null) { + table.close(); + } + } catch (Exception e) { + if (firstException == null) { + firstException = e; + } + } + + try { + if (connection != null) { + connection.close(); + } + } catch (Exception e) { + if (firstException == null) { + firstException = e; + } + } + + if (firstException != null) { + throw firstException; + } + } + + private void maybeCreateDatabaseAndTable() { + try (Admin admin = connection.getAdmin()) { + // Create database if not exists + if (!admin.databaseExists(databaseName).get()) { + admin.createDatabase(databaseName, DatabaseDescriptor.EMPTY, true).get(); + LOG.info("Created Fluss database: {}", databaseName); + } + + // Create table if not exists + if (!admin.tableExists(tablePath).get()) { + Schema schema = + Schema.newBuilder() + .column("state_key", DataTypes.STRING()) + .column("state_payload", DataTypes.BYTES()) + .column("seq_num", DataTypes.BIGINT()) + .column("agent_key", DataTypes.STRING()) + .column("completed", DataTypes.BOOLEAN()) + .column("updated_at", DataTypes.BIGINT()) + .primaryKey("state_key") + .build(); + + int buckets = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE_BUCKETS); + TableDescriptor descriptor = + TableDescriptor.builder() + .schema(schema) + .distributedBy(buckets, "state_key") + .comment("Flink Agents action state store") + .build(); + + admin.createTable(tablePath, descriptor, true).get(); + LOG.info("Created Fluss table: {}", tablePath); + } else { + LOG.info("Fluss table {} already exists", tablePath); + } + } catch (Exception e) { + LOG.error( + "Failed to create or verify Fluss database/table: {}.{}", + databaseName, + tableName, + e); + throw new RuntimeException("Failed to create or verify Fluss database/table", e); + } + } +} diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/operator/ActionExecutionOperator.java b/runtime/src/main/java/org/apache/flink/agents/runtime/operator/ActionExecutionOperator.java index e5015a3a5..76ac47fc3 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/operator/ActionExecutionOperator.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/operator/ActionExecutionOperator.java @@ -39,6 +39,7 @@ import org.apache.flink.agents.runtime.ResourceCache; import org.apache.flink.agents.runtime.actionstate.ActionState; import org.apache.flink.agents.runtime.actionstate.ActionStateStore; +import org.apache.flink.agents.runtime.actionstate.FlussActionStateStore; import org.apache.flink.agents.runtime.actionstate.KafkaActionStateStore; import org.apache.flink.agents.runtime.async.ContinuationActionExecutor; import org.apache.flink.agents.runtime.async.ContinuationContext; @@ -106,6 +107,7 @@ import static org.apache.flink.agents.api.configuration.AgentConfigOptions.BASE_LOG_DIR; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.JOB_IDENTIFIER; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.PRETTY_PRINT; +import static org.apache.flink.agents.runtime.actionstate.ActionStateStore.BackendType.FLUSS; import static org.apache.flink.agents.runtime.actionstate.ActionStateStore.BackendType.KAFKA; import static org.apache.flink.agents.runtime.utils.StateUtil.*; import static org.apache.flink.util.Preconditions.checkState; 
@@ -1136,11 +1138,16 @@ private EventLogger createEventLogger(AgentPlan agentPlan) { } private void maybeInitActionStateStore() { - if (actionStateStore == null - && KAFKA.getType() - .equalsIgnoreCase(agentPlan.getConfig().get(ACTION_STATE_STORE_BACKEND))) { + if (actionStateStore != null) { + return; + } + String backend = agentPlan.getConfig().get(ACTION_STATE_STORE_BACKEND); + if (KAFKA.getType().equalsIgnoreCase(backend)) { LOG.info("Using Kafka as backend of action state store."); actionStateStore = new KafkaActionStateStore(agentPlan.getConfig()); + } else if (FLUSS.getType().equalsIgnoreCase(backend)) { + LOG.info("Using Fluss as backend of action state store."); + actionStateStore = new FlussActionStateStore(agentPlan.getConfig()); } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java new file mode 100644 index 000000000..844a4c195 --- /dev/null +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.agents.runtime.actionstate; + +import org.apache.flink.agents.api.Event; +import org.apache.flink.agents.api.InputEvent; +import org.apache.flink.agents.api.context.RunnerContext; +import org.apache.flink.agents.plan.AgentConfiguration; +import org.apache.flink.agents.plan.JavaFunction; +import org.apache.flink.agents.plan.actions.Action; +import org.apache.fluss.client.admin.Admin; +import org.apache.fluss.metadata.TableInfo; +import org.apache.fluss.metadata.TablePath; +import org.apache.fluss.server.testutils.FlussClusterExtension; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; + +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_DATABASE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE_BUCKETS; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_BOOTSTRAP_SERVERS; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +/** Integration tests for {@link FlussActionStateStore} against an embedded Fluss cluster. 
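+ *
+ * <p>Example invocation (illustrative; assumes the usual Maven Failsafe wiring for *IT tests):
+ * {@code mvn -pl runtime verify -Dit.test=FlussActionStateStoreIT}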
*/ +public class FlussActionStateStoreIT { + + private static final String TEST_DATABASE = "test_flink_agents"; + private static final String TEST_TABLE = "action_state_it"; + private static final String TEST_KEY = "test-key"; + + @RegisterExtension + static final FlussClusterExtension FLUSS_CLUSTER = + FlussClusterExtension.builder().setNumOfTabletServers(1).build(); + + private FlussActionStateStore store; + private Action testAction; + private Event testEvent; + + @BeforeEach + void setUp() throws Exception { + AgentConfiguration config = createAgentConfiguration(); + store = new FlussActionStateStore(config); + + // Wait for table to be ready in the cluster + waitForTableReady(); + + testAction = new TestAction("test-action"); + testEvent = new InputEvent("test-data"); + } + + @AfterEach + void tearDown() throws Exception { + if (store != null) { + store.close(); + } + } + + // ==================== Basic CRUD ==================== + + @Test + void testPutAndGet() throws Exception { + ActionState state = new ActionState(testEvent); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getTaskEvent()).isEqualTo(testEvent); + assertThat(retrieved.isCompleted()).isFalse(); + } + + @Test + void testGetNonExistent() throws Exception { + ActionState result = store.get(TEST_KEY, 999L, testAction, testEvent); + + assertThat(result).isNull(); + } + + @Test + void testMultipleSeqNums() throws Exception { + InputEvent event1 = new InputEvent("data-1"); + InputEvent event2 = new InputEvent("data-2"); + InputEvent event3 = new InputEvent("data-3"); + ActionState state1 = new ActionState(event1); + ActionState state2 = new ActionState(event2); + ActionState state3 = new ActionState(event3); + + store.put(TEST_KEY, 1L, testAction, testEvent, state1); + store.put(TEST_KEY, 2L, testAction, testEvent, state2); + store.put(TEST_KEY, 3L, testAction, testEvent, state3); + + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent).getTaskEvent()).isEqualTo(event1); + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent).getTaskEvent()).isEqualTo(event2); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent).getTaskEvent()).isEqualTo(event3); + } + + @Test + void testUpsertOverwrite() throws Exception { + ActionState original = new ActionState(new InputEvent("original")); + store.put(TEST_KEY, 1L, testAction, testEvent, original); + + InputEvent updatedEvent = new InputEvent("updated"); + ActionState updated = new ActionState(updatedEvent); + store.put(TEST_KEY, 1L, testAction, testEvent, updated); + + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getTaskEvent()).isEqualTo(updatedEvent); + } + + // ==================== Pruning ==================== + + @Test + void testPruneSingleKey() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); + store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); + store.put(TEST_KEY, 3L, testAction, testEvent, new ActionState(testEvent)); + + store.pruneState(TEST_KEY, 2L); + + // Delete may take time to be applied from changelog to KV store + waitUntilDeleted(() -> store.get(TEST_KEY, 1L, testAction, testEvent)); + waitUntilDeleted(() -> store.get(TEST_KEY, 2L, testAction, testEvent)); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); + } + + @Test + void 
testPruneMultipleAgentKeys() throws Exception { + String keyA = "agent-key-a"; + String keyB = "agent-key-b"; + + store.put(keyA, 1L, testAction, testEvent, new ActionState(testEvent)); + store.put(keyA, 2L, testAction, testEvent, new ActionState(testEvent)); + store.put(keyB, 1L, testAction, testEvent, new ActionState(testEvent)); + store.put(keyB, 2L, testAction, testEvent, new ActionState(testEvent)); + + // Prune only keyA up to seqNum 2 + store.pruneState(keyA, 2L); + + // keyA entries pruned (may take time to propagate) + waitUntilDeleted(() -> store.get(keyA, 1L, testAction, testEvent)); + waitUntilDeleted(() -> store.get(keyA, 2L, testAction, testEvent)); + // keyB entries unaffected + assertThat(store.get(keyB, 1L, testAction, testEvent)).isNotNull(); + assertThat(store.get(keyB, 2L, testAction, testEvent)).isNotNull(); + } + + // ==================== Fluss-Specific Behavior ==================== + + @Test + void testRecoveryMarkerReturnsNull() { + Object marker = store.getRecoveryMarker(); + assertThat(marker).isNull(); + } + + @Test + void testRebuildStateNoOp() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); + + // rebuildState should complete without error + store.rebuildState(Collections.emptyList()); + + // Data should still be accessible + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); + } + + // ==================== Durable Execution Patterns ==================== + + @Test + void testCompletionOnlyFlow() throws Exception { + ActionState state = new ActionState(testEvent); + CallResult successResult = + new CallResult( + "myFunction", + "argsDigest123", + "result-payload".getBytes(StandardCharsets.UTF_8)); + state.addCallResult(successResult); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getCallResults()).hasSize(1); + CallResult retrievedResult = retrieved.getCallResult(0); + assertThat(retrievedResult.getFunctionId()).isEqualTo("myFunction"); + assertThat(retrievedResult.getArgsDigest()).isEqualTo("argsDigest123"); + assertThat(retrievedResult.isSuccess()).isTrue(); + assertThat(retrievedResult.getResultPayload()) + .isEqualTo("result-payload".getBytes(StandardCharsets.UTF_8)); + } + + @Test + void testReconcilePendingPersistence() throws Exception { + // Simulate Reconcile pattern: write PENDING state, close store, reopen, verify PENDING + // survives + ActionState state = new ActionState(testEvent); + CallResult pending = CallResult.pending("sideEffectFn", "digest456"); + state.addCallResult(pending); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + store.close(); + + // Create a new store instance (simulating recovery) + FlussActionStateStore recoveredStore = + new FlussActionStateStore(createAgentConfiguration()); + try { + ActionState recovered = recoveredStore.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(recovered).isNotNull(); + assertThat(recovered.getCallResults()).hasSize(1); + CallResult recoveredResult = recovered.getCallResult(0); + assertThat(recoveredResult.isPending()).isTrue(); + assertThat(recoveredResult.getFunctionId()).isEqualTo("sideEffectFn"); + assertThat(recoveredResult.getArgsDigest()).isEqualTo("digest456"); + } finally { + recoveredStore.close(); + } + } + + @Test + void testPruneWorksAfterRecovery() throws Exception { + // This tests the critical bug fix: after close+reopen, the in-memory + // 
agentKeyToStateKeys mapping is empty. The get() method must rebuild + // this mapping so that pruneState() can still delete old records. + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); + store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); + store.put(TEST_KEY, 3L, testAction, testEvent, new ActionState(testEvent)); + store.close(); + + // Simulate recovery: new store instance, mapping is empty + FlussActionStateStore recoveredStore = + new FlussActionStateStore(createAgentConfiguration()); + try { + // get() calls rebuild the mapping + assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); + assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); + + // pruneState should now work because get() rebuilt the mapping + recoveredStore.pruneState(TEST_KEY, 2L); + + waitUntilDeleted(() -> recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)); + waitUntilDeleted(() -> recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)); + assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); + } finally { + recoveredStore.close(); + } + } + + @Test + void testMultipleCallResults() throws Exception { + ActionState state = new ActionState(testEvent); + state.addCallResult( + new CallResult("fn1", "d1", "ok".getBytes(StandardCharsets.UTF_8))); // SUCCEEDED + state.addCallResult(CallResult.pending("fn2", "d2")); // PENDING + state.addCallResult( + new CallResult( + "fn3", "d3", null, "error".getBytes(StandardCharsets.UTF_8))); // FAILED + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getCallResults()).hasSize(3); + assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); + assertThat(retrieved.getCallResult(1).isPending()).isTrue(); + assertThat(retrieved.getCallResult(2).isFailure()).isTrue(); + } + + @Test + void testCompletedAction() throws Exception { + ActionState state = new ActionState(testEvent); + state.addCallResult(new CallResult("fn1", "d1", "result".getBytes(StandardCharsets.UTF_8))); + state.markCompleted(); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.isCompleted()).isTrue(); + assertThat(retrieved.getCallResults()).isEmpty(); + } + + // ==================== Infrastructure ==================== + + @Test + void testIdempotentTableCreation() throws Exception { + // Creating a second store with the same config should succeed + // (table already exists, createTable with ignoreIfExists=true) + FlussActionStateStore secondStore = new FlussActionStateStore(createAgentConfiguration()); + try { + secondStore.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); + assertThat(secondStore.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); + } finally { + secondStore.close(); + } + } + + @Test + void testCloseIdempotent() throws Exception { + store.close(); + assertThatNoException().isThrownBy(() -> store.close()); + // Set to null to prevent double-close in tearDown + store = null; + } + + // ==================== Serialization ==================== + + @Test + void testFullSerializationRoundTrip() throws Exception { + InputEvent taskEvent = new InputEvent("full-test-data"); + 
ActionState state = new ActionState(taskEvent); + state.addEvent(new InputEvent("output-event-1")); + state.addEvent(new InputEvent("output-event-2")); + state.addCallResult( + new CallResult("fn1", "digest1", "payload1".getBytes(StandardCharsets.UTF_8))); + state.addCallResult(CallResult.pending("fn2", "digest2")); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getTaskEvent()).isEqualTo(taskEvent); + assertThat(retrieved.getOutputEvents()).hasSize(2); + assertThat(retrieved.getCallResults()).hasSize(2); + assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); + assertThat(retrieved.getCallResult(1).isPending()).isTrue(); + assertThat(retrieved.isCompleted()).isFalse(); + } + + // ==================== Helpers ==================== + + private AgentConfiguration createAgentConfiguration() { + AgentConfiguration config = new AgentConfiguration(); + config.set(FLUSS_BOOTSTRAP_SERVERS, FLUSS_CLUSTER.getBootstrapServers()); + config.set(FLUSS_ACTION_STATE_DATABASE, TEST_DATABASE); + config.set(FLUSS_ACTION_STATE_TABLE, TEST_TABLE); + config.set(FLUSS_ACTION_STATE_TABLE_BUCKETS, 1); + return config; + } + + private void waitForTableReady() throws Exception { + TablePath tablePath = TablePath.of(TEST_DATABASE, TEST_TABLE); + try (Admin admin = + org.apache.fluss.client.ConnectionFactory.createConnection( + FLUSS_CLUSTER.getClientConfig()) + .getAdmin()) { + TableInfo tableInfo = admin.getTableInfo(tablePath).get(); + FLUSS_CLUSTER.waitUntilTableReady(tableInfo.getTableId()); + } + } + + private static class TestAction extends Action { + + public static void doNothing(Event event, RunnerContext context) { + // No operation + } + + public TestAction(String name) throws Exception { + super( + name, + new JavaFunction( + TestAction.class.getName(), + "doNothing", + new Class[] {Event.class, RunnerContext.class}), + List.of(InputEvent.class.getName())); + } + } + + @FunctionalInterface + private interface ThrowingSupplier { + T get() throws Exception; + } + + /** + * Polls until the supplier returns null, with a timeout. Fluss KV deletes may take time to + * propagate from the changelog to the RocksDB KV store. 
+ */ + private static void waitUntilDeleted(ThrowingSupplier supplier) throws Exception { + long deadline = System.currentTimeMillis() + 30_000; + while (System.currentTimeMillis() < deadline) { + if (supplier.get() == null) { + return; + } + Thread.sleep(200); + } + assertThat(supplier.get()).isNull(); + } +} From 710ec84fe951ecfde62c68ebbb67294fbee0e74f Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Wed, 22 Apr 2026 19:36:36 +0800 Subject: [PATCH 2/9] [runtime] Switch Fluss action state backend from KV table to log table --- .../actionstate/FlussActionStateStore.java | 242 ++++++++---------- 1 file changed, 113 insertions(+), 129 deletions(-) diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java index 8408e0c6f..cfd2afbd3 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -27,13 +27,15 @@ import org.apache.fluss.client.Connection; import org.apache.fluss.client.ConnectionFactory; import org.apache.fluss.client.admin.Admin; -import org.apache.fluss.client.lookup.LookupResult; -import org.apache.fluss.client.lookup.Lookuper; import org.apache.fluss.client.table.Table; -import org.apache.fluss.client.table.writer.UpsertWriter; +import org.apache.fluss.client.table.scanner.ScanRecord; +import org.apache.fluss.client.table.scanner.log.LogScanner; +import org.apache.fluss.client.table.scanner.log.ScanRecords; +import org.apache.fluss.client.table.writer.AppendWriter; import org.apache.fluss.config.Configuration; import org.apache.fluss.metadata.DatabaseDescriptor; import org.apache.fluss.metadata.Schema; +import org.apache.fluss.metadata.TableBucket; import org.apache.fluss.metadata.TableDescriptor; import org.apache.fluss.metadata.TablePath; import org.apache.fluss.row.BinaryString; @@ -43,6 +45,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.time.Duration; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -59,27 +62,29 @@ import static org.apache.flink.agents.runtime.actionstate.ActionStateUtil.generateKey; /** - * An implementation of {@link ActionStateStore} that uses Apache Fluss as the backend storage for - * action states. Fluss provides a primary key table with KV point lookup capability, eliminating - * the need for in-memory state rebuilding during recovery. + * An implementation of {@link ActionStateStore} that uses an Apache Fluss log table as the backend. + * All state is maintained in an in-memory HashMap for O(1) lookup, with the Fluss log table + * providing durability and recovery support. * - *
- * <p>Compared to the Kafka-based implementation:
+ * <p>Compared to the previous KV-table implementation:
  *
  * <ul>
- *   <li>Recovery: No sequential consumption needed; direct KV point lookup (O(1)).
- *   <li>Memory: No in-memory cache required; on-demand queries only.
- *   <li>Write: In-place upsert instead of append-only log.
+ *   <li>Read: O(1) in-memory HashMap lookup; no network I/O per query.
+ *   <li>Write: Append to log + update in-memory HashMap.
+ *   <li>Recovery: Scan log from beginning to rebuild in-memory state.
+ *   <li>Prune: In-memory eviction only; physical cleanup relies on Fluss log retention.
  * </ul>
  *
- * <p>The Fluss table schema:
+ * <p>The Fluss log table schema (no primary key — this creates a log/append-only table):
  *
  * <pre>
- * state_key   STRING   -- Primary key, generated by ActionStateUtil.generateKey()
- * state_payload BYTES  -- ActionState JSON serialized bytes
- * seq_num     BIGINT   -- Sequence number for pruneState()
- * agent_key   STRING   -- Original key for prefix-based cleanup
- * completed   BOOLEAN  -- Whether action completed
- * updated_at  BIGINT   -- Last update timestamp
+ * state_key     STRING  -- Generated by ActionStateUtil.generateKey()
+ * state_payload BYTES   -- ActionState JSON serialized bytes
+ * seq_num       BIGINT  -- Sequence number for pruneState()
+ * agent_key     STRING  -- Original key; used as bucket key so all records for the same
+ *                          agent are co-located in the same bucket
+ * completed     BOOLEAN -- Whether action completed
+ * updated_at    BIGINT  -- Last update timestamp
  * </pre>
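+ *
+ * <p>Recovery usage sketch (illustrative; it shows the intended call order, not new API):
+ *
+ * <pre>{@code
+ * store.rebuildState(recoveryMarkers); // full log scan; last record per state key wins
+ * ActionState state = store.get(key, seqNum, action, event); // served from the rebuilt map
+ * }</pre>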
*/ public class FlussActionStateStore implements ActionStateStore { @@ -87,10 +92,16 @@ public class FlussActionStateStore implements ActionStateStore { private static final Logger LOG = LoggerFactory.getLogger(FlussActionStateStore.class); private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); + /** + * Number of consecutive empty polls before considering the log fully consumed during recovery. + */ + private static final int EMPTY_POLL_THRESHOLD = 3; + + private static final Duration POLL_TIMEOUT = Duration.ofSeconds(1); + /** Creates and configures the ObjectMapper for ActionState serialization. */ private static ObjectMapper createObjectMapper() { ObjectMapper mapper = new ObjectMapper(); - // Add type information for polymorphic Event deserialization mapper.addMixIn(Event.class, EventTypeInfoMixin.class); mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); @@ -120,28 +131,21 @@ abstract static class EventTypeInfoMixin {} private final Connection connection; private final Table table; - private final UpsertWriter writer; - private final Lookuper lookuper; + private final AppendWriter writer; - /** - * Tracks the mapping from agent key to its associated state keys. This is needed because - * pruneState() needs to delete by agent key prefix, and Fluss primary key lookup requires exact - * key match. - */ - private final Map> agentKeyToStateKeys; + /** In-memory cache for O(1) state lookups; rebuilt from Fluss log on recovery. */ + private final Map actionStates; public FlussActionStateStore(AgentConfiguration agentConfiguration) { this.agentConfiguration = agentConfiguration; this.databaseName = agentConfiguration.get(FLUSS_ACTION_STATE_DATABASE); this.tableName = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE); this.tablePath = TablePath.of(databaseName, tableName); - this.agentKeyToStateKeys = new HashMap<>(); + this.actionStates = new HashMap<>(); - // Create Fluss connection Configuration flussConf = new Configuration(); flussConf.setString("bootstrap.servers", agentConfiguration.get(FLUSS_BOOTSTRAP_SERVERS)); - // Security / authentication config String securityProtocol = agentConfiguration.get(FLUSS_SECURITY_PROTOCOL); if (securityProtocol != null) { flussConf.setString("client.security.protocol", securityProtocol); @@ -164,16 +168,14 @@ public FlussActionStateStore(AgentConfiguration agentConfiguration) { } this.connection = ConnectionFactory.createConnection(flussConf); - - // Ensure database and table exist maybeCreateDatabaseAndTable(); - - // Get table client and create writer/lookuper this.table = connection.getTable(tablePath); - this.writer = table.newUpsert().createWriter(); - this.lookuper = table.newLookup().createLookuper(); + this.writer = table.newAppend().createWriter(); - LOG.info("Initialized FlussActionStateStore with table: {}.{}", databaseName, tableName); + LOG.info( + "Initialized FlussActionStateStore (log table) with table: {}.{}", + databaseName, + tableName); } @Override @@ -190,124 +192,108 @@ public void put(Object key, long seqNum, Action action, Event event, ActionState row.setField(COL_COMPLETED, state.isCompleted()); row.setField(COL_UPDATED_AT, System.currentTimeMillis()); - // Synchronous write to ensure durability before returning. - // This is critical for the Reconcile pattern where PENDING slots must be - // persisted before call() executes. - writer.upsert(row).get(); + // Append to Fluss log for durability, then update in-memory cache. 
+ // Synchronous write ensures the record is durable before returning. + writer.append(row).get(); + actionStates.put(stateKey, state); - // Track the agent key -> state key mapping for pruneState() - agentKeyToStateKeys - .computeIfAbsent(key.toString(), k -> new HashMap<>()) - .put(stateKey, seqNum); - - LOG.debug( - "Stored action state to Fluss: key={}, isCompleted={}", - stateKey, - state.isCompleted()); + LOG.debug("Stored action state: key={}, isCompleted={}", stateKey, state.isCompleted()); } @Override public ActionState get(Object key, long seqNum, Action action, Event event) throws Exception { String stateKey = generateKey(key, seqNum, action, event); + ActionState state = actionStates.get(stateKey); + LOG.debug("Lookup action state: key={}, found={}", stateKey, state != null); + return state; + } - LOG.debug("Looking up action state from Fluss: key={}, seqNum={}", key, seqNum); - - GenericRow lookupKey = new GenericRow(1); - lookupKey.setField(0, BinaryString.fromString(stateKey)); + /** + * Rebuilds in-memory state by scanning the Fluss log table from the beginning. Reads until the + * log is exhausted ({@link #EMPTY_POLL_THRESHOLD} consecutive empty polls). For the same state + * key appearing multiple times in the log, the latest record wins (last-write-wins). + */ + @Override + public void rebuildState(List recoveryMarkers) { + LOG.info("Rebuilding action state from Fluss log table"); + actionStates.clear(); - LookupResult result = lookuper.lookup(lookupKey).get(); - InternalRow row = result.getSingletonRow(); + int numBuckets = table.getTableInfo().getNumBuckets(); + try (LogScanner scanner = table.newScan().createLogScanner()) { + for (int b = 0; b < numBuckets; b++) { + scanner.subscribeFromBeginning(b); + } - if (row == null) { - LOG.debug("Action state not found in Fluss: stateKey={}", stateKey); - return null; + int emptyPollCount = 0; + while (emptyPollCount < EMPTY_POLL_THRESHOLD) { + ScanRecords records = scanner.poll(POLL_TIMEOUT); + if (records.isEmpty()) { + emptyPollCount++; + } else { + emptyPollCount = 0; + for (TableBucket bucket : records.buckets()) { + for (ScanRecord record : records.records(bucket)) { + InternalRow row = record.getRow(); + String stateKey = row.getString(COL_STATE_KEY).toString(); + byte[] payload = row.getBytes(COL_STATE_PAYLOAD); + try { + ActionState state = + OBJECT_MAPPER.readValue(payload, ActionState.class); + actionStates.put(stateKey, state); + } catch (Exception e) { + LOG.warn( + "Failed to deserialize action state for key: {}, skipping", + stateKey, + e); + } + } + } + } + } + } catch (Exception e) { + throw new RuntimeException("Failed to rebuild state from Fluss log table", e); } - byte[] payload = row.getBytes(COL_STATE_PAYLOAD); - ActionState state = OBJECT_MAPPER.readValue(payload, ActionState.class); - - // Rebuild the agentKeyToStateKeys mapping on read. - // This is critical after recovery: since rebuildState() is a no-op, the mapping - // is empty. Without this, pruneState() would silently skip cleanup for any state - // that was written before the restart, causing unbounded table growth. - agentKeyToStateKeys - .computeIfAbsent(key.toString(), k -> new HashMap<>()) - .put(stateKey, seqNum); - - LOG.debug( - "Found action state in Fluss: key={}, isCompleted={}", - stateKey, - state.isCompleted()); - return state; + LOG.info("Completed rebuilding state, recovered {} states", actionStates.size()); } /** - * No-op for Fluss backend. 
Unlike the Kafka implementation which must consume from offsets to - * rebuild an in-memory HashMap, Fluss's KV table is inherently persistent. Each {@link #get} - * call performs a direct point lookup, so no in-memory state needs to be rebuilt. + * Returns null. Recovery always rebuilds from the Fluss log beginning, so no positional markers + * are needed in Flink operator state. */ @Override - public void rebuildState(List recoveryMarkers) { - LOG.info( - "Fluss backend: rebuildState is a no-op (KV table is persistent). " - + "Recovery markers: {}", - recoveryMarkers.size()); + public Object getRecoveryMarker() { + return null; } + /** + * Evicts pruned states from the in-memory cache. The Fluss log is append-only; physical cleanup + * relies on Fluss log retention configuration. + */ @Override public void pruneState(Object key, long seqNum) { - LOG.debug("Pruning state for key: {} up to sequence number: {}", key, seqNum); + LOG.debug("Pruning in-memory state for key: {} up to seqNum: {}", key, seqNum); - Map stateKeys = agentKeyToStateKeys.get(key.toString()); - if (stateKeys == null) { - return; - } - - stateKeys + actionStates .entrySet() .removeIf( entry -> { - if (entry.getValue() <= seqNum) { + String stateKey = entry.getKey(); + if (stateKey.startsWith(key.toString() + "_")) { try { - GenericRow deleteRow = new GenericRow(NUM_COLUMNS); - deleteRow.setField( - COL_STATE_KEY, BinaryString.fromString(entry.getKey())); - writer.delete(deleteRow).get(); - LOG.debug("Pruned state key: {}", entry.getKey()); - } catch (Exception e) { - LOG.warn( - "Failed to delete state key during pruning: {}", - entry.getKey(), - e); + List parts = ActionStateUtil.parseKey(stateKey); + if (parts.size() >= 2) { + long stateSeqNum = Long.parseLong(parts.get(1)); + return stateSeqNum <= seqNum; + } + } catch (NumberFormatException e) { + LOG.warn("Failed to parse seqNum from state key: {}", stateKey); } - return true; } return false; }); - // Remove agent key entry if no more state keys - if (stateKeys.isEmpty()) { - agentKeyToStateKeys.remove(key.toString()); - } - - // Flush to ensure deletes are visible to the lookuper - try { - writer.flush(); - } catch (Exception e) { - LOG.warn("Failed to flush writer after pruning", e); - } - - LOG.debug("Pruned state for key: {} up to sequence number: {}", key, seqNum); - } - - /** - * Returns null because the Fluss KV table is persistent and does not need recovery markers. - * Unlike Kafka which uses partition end offsets as markers, Fluss's point lookup works directly - * without any positional information. - */ - @Override - public Object getRecoveryMarker() { - return null; + LOG.debug("Pruned in-memory state for key: {} up to seqNum: {}", key, seqNum); } @Override @@ -349,14 +335,13 @@ public void close() throws Exception { private void maybeCreateDatabaseAndTable() { try (Admin admin = connection.getAdmin()) { - // Create database if not exists if (!admin.databaseExists(databaseName).get()) { admin.createDatabase(databaseName, DatabaseDescriptor.EMPTY, true).get(); LOG.info("Created Fluss database: {}", databaseName); } - // Create table if not exists if (!admin.tableExists(tablePath).get()) { + // No primaryKey() call — this creates an append-only log table in Fluss. 
Schema schema = Schema.newBuilder() .column("state_key", DataTypes.STRING()) @@ -365,7 +350,6 @@ private void maybeCreateDatabaseAndTable() { .column("agent_key", DataTypes.STRING()) .column("completed", DataTypes.BOOLEAN()) .column("updated_at", DataTypes.BIGINT()) - .primaryKey("state_key") .build(); int buckets = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE_BUCKETS); @@ -373,11 +357,11 @@ private void maybeCreateDatabaseAndTable() { TableDescriptor.builder() .schema(schema) .distributedBy(buckets, "state_key") - .comment("Flink Agents action state store") + .comment("Flink Agents action state log") .build(); admin.createTable(tablePath, descriptor, true).get(); - LOG.info("Created Fluss table: {}", tablePath); + LOG.info("Created Fluss log table: {}", tablePath); } else { LOG.info("Fluss table {} already exists", tablePath); } From ffd7277b657af2788b9e6fb88b3d29fc981bb926 Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Mon, 27 Apr 2026 15:12:35 +0800 Subject: [PATCH 3/9] Optimized FlussActionStateStore recovery mechanism and adjusted configuration. --- .../api/configuration/AgentConfigOptions.java | 2 +- docs/content/docs/operations/configuration.md | 4 +- pom.xml | 1 + runtime/pom.xml | 14 -- .../actionstate/FlussActionStateStore.java | 85 ++++++--- .../actionstate/FlussActionStateStoreIT.java | 165 +++++------------- .../FlussActionStateStoreTest.java | 136 +++++++++++++++ 7 files changed, 244 insertions(+), 163 deletions(-) create mode 100644 runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java diff --git a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java index 03858c7c2..45342121f 100644 --- a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java +++ b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java @@ -65,7 +65,7 @@ public class AgentConfigOptions { /** The config parameter specifies the number of buckets for the Fluss action state table. */ public static final ConfigOption FLUSS_ACTION_STATE_TABLE_BUCKETS = - new ConfigOption<>("flussActionStateTableBuckets", Integer.class, 8); + new ConfigOption<>("flussActionStateTableBuckets", Integer.class, 64); /** The config parameter specifies the authentication protocol for Fluss client. */ public static final ConfigOption FLUSS_SECURITY_PROTOCOL = diff --git a/docs/content/docs/operations/configuration.md b/docs/content/docs/operations/configuration.md index e55078c6a..55b54e756 100644 --- a/docs/content/docs/operations/configuration.md +++ b/docs/content/docs/operations/configuration.md @@ -159,14 +159,14 @@ Here are the configuration options for Kafka-based Action State Store. #### Fluss-based Action State Store -Here are the configuration options for Fluss-based Action State Store. Fluss provides O(1) KV point lookup for action state, eliminating the need for in-memory state rebuilding during recovery. +Here are the configuration options for Fluss-based Action State Store. | Key | Default | Type | Description | |------------------------------|------------------|---------|------------------------------------------------------------------------------------------| | `flussBootstrapServers` | "localhost:9123" | String | The Fluss bootstrap servers address. | | `flussActionStateDatabase` | "flink_agents" | String | The Fluss database name for storing action state. 
| | `flussActionStateTable` | "action_state" | String | The Fluss table name for storing action state. | -| `flussActionStateTableBuckets` | 8 | Integer | The number of buckets for the Fluss action state table. | +| `flussActionStateTableBuckets` | 64 | Integer | The number of buckets for the Fluss action state table. | | `flussSecurityProtocol` | "PLAINTEXT" | String | The authentication protocol for Fluss client (e.g., `PLAINTEXT`, `SASL_PLAIN`). | | `flussSaslMechanism` | "PLAIN" | String | The SASL mechanism for Fluss authentication. | | `flussSaslJaasConfig` | (none) | String | The JAAS configuration string for Fluss SASL authentication. | diff --git a/pom.xml b/pom.xml index 416a41f81..e77b0a51c 100644 --- a/pom.xml +++ b/pom.xml @@ -310,6 +310,7 @@ under the License. --add-opens=java.base/java.util=ALL-UNNAMED + --add-opens=java.base/java.nio=ALL-UNNAMED --add-exports=java.base/jdk.internal.vm=ALL-UNNAMED diff --git a/runtime/pom.xml b/runtime/pom.xml index c2ee09449..b37a44550 100644 --- a/runtime/pom.xml +++ b/runtime/pom.xml @@ -136,20 +136,6 @@ under the License. test-jar test - - org.apache.fluss - fluss-common - ${fluss.version} - test-jar - test - - - org.apache.fluss - fluss-rpc - ${fluss.version} - test-jar - test - org.apache.fluss fluss-test-utils diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java index cfd2afbd3..39eeba914 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -27,6 +27,7 @@ import org.apache.fluss.client.Connection; import org.apache.fluss.client.ConnectionFactory; import org.apache.fluss.client.admin.Admin; +import org.apache.fluss.client.admin.OffsetSpec; import org.apache.fluss.client.table.Table; import org.apache.fluss.client.table.scanner.ScanRecord; import org.apache.fluss.client.table.scanner.log.LogScanner; @@ -46,6 +47,7 @@ import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -71,7 +73,8 @@ *
 *
 *   • Read: O(1) in-memory HashMap lookup; no network I/O per query.
 *   • Write: Append to log + update in-memory HashMap.
- *   • Recovery: Scan log from beginning to rebuild in-memory state.
+ *   • Recovery: Scan log from recovery markers (or from beginning if none) to rebuild
+ *     in-memory state.
 *   • Prune: In-memory eviction only; physical cleanup relies on Fluss log retention.
 *
 *
  * state_key     STRING  -- Generated by ActionStateUtil.generateKey()
  * state_payload BYTES   -- ActionState JSON serialized bytes
- * seq_num       BIGINT  -- Sequence number for pruneState()
  * agent_key     STRING  -- Original key; used as bucket key so all records for the same
  *                          agent are co-located in the same bucket
- * completed     BOOLEAN -- Whether action completed
- * updated_at    BIGINT  -- Last update timestamp
  * 
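 *
 *   Example row (state_key layout illustrative; the actual format comes from
 *   ActionStateUtil.generateKey):
 *     state_key = "order-42_7_reviewAction_ev1", agent_key = "order-42"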
*/ public class FlussActionStateStore implements ActionStateStore { @@ -118,11 +118,8 @@ abstract static class EventTypeInfoMixin {} // Column indices in the Fluss table schema private static final int COL_STATE_KEY = 0; private static final int COL_STATE_PAYLOAD = 1; - private static final int COL_SEQ_NUM = 2; - private static final int COL_AGENT_KEY = 3; - private static final int COL_COMPLETED = 4; - private static final int COL_UPDATED_AT = 5; - private static final int NUM_COLUMNS = 6; + private static final int COL_AGENT_KEY = 2; + private static final int NUM_COLUMNS = 3; private final AgentConfiguration agentConfiguration; private final String databaseName; @@ -187,10 +184,7 @@ public void put(Object key, long seqNum, Action action, Event event, ActionState GenericRow row = new GenericRow(NUM_COLUMNS); row.setField(COL_STATE_KEY, BinaryString.fromString(stateKey)); row.setField(COL_STATE_PAYLOAD, payload); - row.setField(COL_SEQ_NUM, seqNum); row.setField(COL_AGENT_KEY, BinaryString.fromString(key.toString())); - row.setField(COL_COMPLETED, state.isCompleted()); - row.setField(COL_UPDATED_AT, System.currentTimeMillis()); // Append to Fluss log for durability, then update in-memory cache. // Synchronous write ensures the record is durable before returning. @@ -209,19 +203,50 @@ public ActionState get(Object key, long seqNum, Action action, Event event) thro } /** - * Rebuilds in-memory state by scanning the Fluss log table from the beginning. Reads until the - * log is exhausted ({@link #EMPTY_POLL_THRESHOLD} consecutive empty polls). For the same state - * key appearing multiple times in the log, the latest record wins (last-write-wins). + * Rebuilds in-memory state by scanning the Fluss log table. If recovery markers are provided, + * computes the minimum offset per bucket across all markers and subscribes from those offsets. + * Otherwise, subscribes from the beginning. Reads until the log is exhausted ({@link + * #EMPTY_POLL_THRESHOLD} consecutive empty polls). For the same state key appearing multiple + * times in the log, the latest record wins (last-write-wins). + * + *
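+     * (e.g. if the same state_key was appended at offsets 10 and 57, the payload
+     * from offset 57 is the one kept in the rebuilt cache)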

Note: Unlike the Kafka implementation which skips rebuild on empty markers, this + * implementation scans from the beginning when no markers are provided, enabling full state + * recovery even without prior checkpoints. */ @Override public void rebuildState(List recoveryMarkers) { - LOG.info("Rebuilding action state from Fluss log table"); + LOG.info( + "Rebuilding action state from Fluss log table with {} recovery markers", + recoveryMarkers.size()); actionStates.clear(); int numBuckets = table.getTableInfo().getNumBuckets(); + + // Compute per-bucket start offsets from recovery markers + Map bucketOffsets = new HashMap<>(); + for (Object marker : recoveryMarkers) { + if (marker instanceof Map) { + @SuppressWarnings("unchecked") + Map markerMap = (Map) marker; + for (Map.Entry entry : markerMap.entrySet()) { + bucketOffsets.merge(entry.getKey(), entry.getValue(), Math::min); + } + } else if (marker != null) { + LOG.warn( + "Ignoring unrecognized recovery marker type: {}", + marker.getClass().getName()); + } + } + try (LogScanner scanner = table.newScan().createLogScanner()) { - for (int b = 0; b < numBuckets; b++) { - scanner.subscribeFromBeginning(b); + if (bucketOffsets.isEmpty()) { + for (int b = 0; b < numBuckets; b++) { + scanner.subscribeFromBeginning(b); + } + } else { + for (Map.Entry entry : bucketOffsets.entrySet()) { + scanner.subscribe(entry.getKey(), entry.getValue()); + } } int emptyPollCount = 0; @@ -258,12 +283,25 @@ public void rebuildState(List recoveryMarkers) { } /** - * Returns null. Recovery always rebuilds from the Fluss log beginning, so no positional markers - * are needed in Flink operator state. + * Returns the end offsets of each bucket as a recovery marker. Similar to Kafka's + * implementation, this captures the current log position so that {@link #rebuildState} can + * resume from these offsets instead of scanning from the beginning. 
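+     * <p>Illustrative round trip (marker shape: {@code Map<Integer, Long>}, bucket → end offset):
+     * <pre>{@code
+     * Object marker = store.getRecoveryMarker();  // e.g. {0=120, 1=98}
+     * // persist the marker in Flink operator state; after restore:
+     * store.rebuildState(List.of(marker));        // replays the log from these offsets
+     * }</pre>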
*/ @Override public Object getRecoveryMarker() { - return null; + int numBuckets = table.getTableInfo().getNumBuckets(); + + try (Admin admin = connection.getAdmin()) { + List buckets = new ArrayList<>(); + for (int b = 0; b < numBuckets; b++) { + buckets.add(b); + } + return new HashMap<>( + admin.listOffsets(tablePath, buckets, new OffsetSpec.LatestSpec()).all().get()); + } catch (Exception e) { + LOG.error("Failed to get end offsets for Fluss table: {}", tablePath, e); + throw new RuntimeException("Failed to get end offsets for Fluss table", e); + } } /** @@ -346,17 +384,14 @@ private void maybeCreateDatabaseAndTable() { Schema.newBuilder() .column("state_key", DataTypes.STRING()) .column("state_payload", DataTypes.BYTES()) - .column("seq_num", DataTypes.BIGINT()) .column("agent_key", DataTypes.STRING()) - .column("completed", DataTypes.BOOLEAN()) - .column("updated_at", DataTypes.BIGINT()) .build(); int buckets = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE_BUCKETS); TableDescriptor descriptor = TableDescriptor.builder() .schema(schema) - .distributedBy(buckets, "state_key") + .distributedBy(buckets, "agent_key") .comment("Flink Agents action state log") .build(); diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java index 844a4c195..f830752a2 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java @@ -32,9 +32,9 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; -import java.nio.charset.StandardCharsets; import java.util.Collections; import java.util.List; +import java.util.Map; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_DATABASE; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE; @@ -140,9 +140,9 @@ void testPruneSingleKey() throws Exception { store.pruneState(TEST_KEY, 2L); - // Delete may take time to be applied from changelog to KV store - waitUntilDeleted(() -> store.get(TEST_KEY, 1L, testAction, testEvent)); - waitUntilDeleted(() -> store.get(TEST_KEY, 2L, testAction, testEvent)); + // pruneState is synchronous (in-memory eviction) + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNull(); assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); } @@ -159,62 +159,62 @@ void testPruneMultipleAgentKeys() throws Exception { // Prune only keyA up to seqNum 2 store.pruneState(keyA, 2L); - // keyA entries pruned (may take time to propagate) - waitUntilDeleted(() -> store.get(keyA, 1L, testAction, testEvent)); - waitUntilDeleted(() -> store.get(keyA, 2L, testAction, testEvent)); + // pruneState is synchronous (in-memory eviction) + assertThat(store.get(keyA, 1L, testAction, testEvent)).isNull(); + assertThat(store.get(keyA, 2L, testAction, testEvent)).isNull(); // keyB entries unaffected assertThat(store.get(keyB, 1L, testAction, testEvent)).isNotNull(); assertThat(store.get(keyB, 2L, testAction, testEvent)).isNotNull(); } - // ==================== Fluss-Specific Behavior ==================== + // ==================== Recovery ==================== @Test - void testRecoveryMarkerReturnsNull() { + @SuppressWarnings("unchecked") + void 
testRecoveryMarkerReturnsBucketOffsets() { Object marker = store.getRecoveryMarker(); - assertThat(marker).isNull(); + assertThat(marker).isNotNull(); + assertThat(marker).isInstanceOf(Map.class); + Map bucketOffsets = (Map) marker; + // With 1 bucket configured, we should have exactly 1 entry + assertThat(bucketOffsets).hasSize(1); + assertThat(bucketOffsets).containsKey(0); + assertThat(bucketOffsets.get(0)).isGreaterThanOrEqualTo(0L); } @Test - void testRebuildStateNoOp() throws Exception { + void testRebuildStateFromBeginning() throws Exception { store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); - // rebuildState should complete without error + // rebuildState with empty markers should scan from beginning store.rebuildState(Collections.emptyList()); - // Data should still be accessible assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); } - // ==================== Durable Execution Patterns ==================== - @Test - void testCompletionOnlyFlow() throws Exception { - ActionState state = new ActionState(testEvent); - CallResult successResult = - new CallResult( - "myFunction", - "argsDigest123", - "result-payload".getBytes(StandardCharsets.UTF_8)); - state.addCallResult(successResult); + @SuppressWarnings("unchecked") + void testRebuildStateWithRecoveryMarkers() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); - store.put(TEST_KEY, 1L, testAction, testEvent, state); - ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + // Capture recovery marker after writing data + Object marker = store.getRecoveryMarker(); - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getCallResults()).hasSize(1); - CallResult retrievedResult = retrieved.getCallResult(0); - assertThat(retrievedResult.getFunctionId()).isEqualTo("myFunction"); - assertThat(retrievedResult.getArgsDigest()).isEqualTo("argsDigest123"); - assertThat(retrievedResult.isSuccess()).isTrue(); - assertThat(retrievedResult.getResultPayload()) - .isEqualTo("result-payload".getBytes(StandardCharsets.UTF_8)); + // Write more data after the marker + store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); + + // Rebuild using the marker; should replay from marker offset and recover data after it + store.rebuildState(List.of(marker)); + + // Data written before the marker should NOT be in the rebuilt cache + // (rebuildState clears the cache and only replays from the marker offset) + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); + // Data written after the marker should be recovered + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); } @Test void testReconcilePendingPersistence() throws Exception { - // Simulate Reconcile pattern: write PENDING state, close store, reopen, verify PENDING - // survives ActionState state = new ActionState(testEvent); CallResult pending = CallResult.pending("sideEffectFn", "digest456"); state.addCallResult(pending); @@ -226,6 +226,8 @@ void testReconcilePendingPersistence() throws Exception { FlussActionStateStore recoveredStore = new FlussActionStateStore(createAgentConfiguration()); try { + // Rebuild state from the log + recoveredStore.rebuildState(Collections.emptyList()); ActionState recovered = recoveredStore.get(TEST_KEY, 1L, testAction, testEvent); assertThat(recovered).isNotNull(); @@ -241,68 +243,33 @@ void testReconcilePendingPersistence() throws Exception { @Test void testPruneWorksAfterRecovery() throws Exception { 
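+        // (Editorial) End-to-end recovery flow: a fresh store instance rebuilds its
+        // cache from the Fluss log, after which pruning is a pure in-memory eviction.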
- // This tests the critical bug fix: after close+reopen, the in-memory - // agentKeyToStateKeys mapping is empty. The get() method must rebuild - // this mapping so that pruneState() can still delete old records. store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); store.put(TEST_KEY, 3L, testAction, testEvent, new ActionState(testEvent)); store.close(); - // Simulate recovery: new store instance, mapping is empty + // Simulate recovery: new store instance FlussActionStateStore recoveredStore = new FlussActionStateStore(createAgentConfiguration()); try { - // get() calls rebuild the mapping + // Rebuild state from the log + recoveredStore.rebuildState(Collections.emptyList()); + assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); - // pruneState should now work because get() rebuilt the mapping recoveredStore.pruneState(TEST_KEY, 2L); - waitUntilDeleted(() -> recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)); - waitUntilDeleted(() -> recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)); + // pruneState is synchronous (in-memory eviction) + assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); + assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNull(); assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); } finally { recoveredStore.close(); } } - @Test - void testMultipleCallResults() throws Exception { - ActionState state = new ActionState(testEvent); - state.addCallResult( - new CallResult("fn1", "d1", "ok".getBytes(StandardCharsets.UTF_8))); // SUCCEEDED - state.addCallResult(CallResult.pending("fn2", "d2")); // PENDING - state.addCallResult( - new CallResult( - "fn3", "d3", null, "error".getBytes(StandardCharsets.UTF_8))); // FAILED - - store.put(TEST_KEY, 1L, testAction, testEvent, state); - ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); - - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getCallResults()).hasSize(3); - assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); - assertThat(retrieved.getCallResult(1).isPending()).isTrue(); - assertThat(retrieved.getCallResult(2).isFailure()).isTrue(); - } - - @Test - void testCompletedAction() throws Exception { - ActionState state = new ActionState(testEvent); - state.addCallResult(new CallResult("fn1", "d1", "result".getBytes(StandardCharsets.UTF_8))); - state.markCompleted(); - - store.put(TEST_KEY, 1L, testAction, testEvent, state); - ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); - - assertThat(retrieved).isNotNull(); - assertThat(retrieved.isCompleted()).isTrue(); - assertThat(retrieved.getCallResults()).isEmpty(); - } - // ==================== Infrastructure ==================== @Test @@ -326,30 +293,6 @@ void testCloseIdempotent() throws Exception { store = null; } - // ==================== Serialization ==================== - - @Test - void testFullSerializationRoundTrip() throws Exception { - InputEvent taskEvent = new InputEvent("full-test-data"); - ActionState state = new ActionState(taskEvent); - state.addEvent(new InputEvent("output-event-1")); - state.addEvent(new InputEvent("output-event-2")); - state.addCallResult( - new CallResult("fn1", "digest1", 
"payload1".getBytes(StandardCharsets.UTF_8))); - state.addCallResult(CallResult.pending("fn2", "digest2")); - - store.put(TEST_KEY, 1L, testAction, testEvent, state); - ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); - - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getTaskEvent()).isEqualTo(taskEvent); - assertThat(retrieved.getOutputEvents()).hasSize(2); - assertThat(retrieved.getCallResults()).hasSize(2); - assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); - assertThat(retrieved.getCallResult(1).isPending()).isTrue(); - assertThat(retrieved.isCompleted()).isFalse(); - } - // ==================== Helpers ==================== private AgentConfiguration createAgentConfiguration() { @@ -388,24 +331,4 @@ public TestAction(String name) throws Exception { List.of(InputEvent.class.getName())); } } - - @FunctionalInterface - private interface ThrowingSupplier { - T get() throws Exception; - } - - /** - * Polls until the supplier returns null, with a timeout. Fluss KV deletes may take time to - * propagate from the changelog to the RocksDB KV store. - */ - private static void waitUntilDeleted(ThrowingSupplier supplier) throws Exception { - long deadline = System.currentTimeMillis() + 30_000; - while (System.currentTimeMillis() < deadline) { - if (supplier.get() == null) { - return; - } - Thread.sleep(200); - } - assertThat(supplier.get()).isNull(); - } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java new file mode 100644 index 000000000..d2f948f40 --- /dev/null +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.agents.runtime.actionstate; + +import com.fasterxml.jackson.annotation.JsonTypeInfo; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.flink.agents.api.Event; +import org.apache.flink.agents.api.InputEvent; +import org.apache.flink.agents.api.OutputEvent; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.nio.charset.StandardCharsets; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Unit tests for {@link FlussActionStateStore} serialization and in-memory behavior. These tests + * verify the ObjectMapper configuration and ActionState serialization round-trips without requiring + * a real Fluss cluster. 
+ */ +public class FlussActionStateStoreTest { + + private ObjectMapper objectMapper; + + @JsonTypeInfo( + use = JsonTypeInfo.Id.CLASS, + include = JsonTypeInfo.As.PROPERTY, + property = "@class") + abstract static class EventTypeInfoMixin {} + + @BeforeEach + void setUp() { + objectMapper = new ObjectMapper(); + objectMapper.addMixIn(Event.class, EventTypeInfoMixin.class); + objectMapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); + objectMapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); + } + + @Test + void testCompletionOnlyFlow() throws Exception { + InputEvent taskEvent = new InputEvent("test-data"); + ActionState state = new ActionState(taskEvent); + CallResult successResult = + new CallResult( + "myFunction", + "argsDigest123", + "result-payload".getBytes(StandardCharsets.UTF_8)); + state.addCallResult(successResult); + + byte[] payload = objectMapper.writeValueAsBytes(state); + ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getCallResults()).hasSize(1); + CallResult retrievedResult = retrieved.getCallResult(0); + assertThat(retrievedResult.getFunctionId()).isEqualTo("myFunction"); + assertThat(retrievedResult.getArgsDigest()).isEqualTo("argsDigest123"); + assertThat(retrievedResult.isSuccess()).isTrue(); + assertThat(retrievedResult.getResultPayload()) + .isEqualTo("result-payload".getBytes(StandardCharsets.UTF_8)); + } + + @Test + void testMultipleCallResults() throws Exception { + InputEvent taskEvent = new InputEvent("test-data"); + ActionState state = new ActionState(taskEvent); + state.addCallResult( + new CallResult("fn1", "d1", "ok".getBytes(StandardCharsets.UTF_8))); // SUCCEEDED + state.addCallResult(CallResult.pending("fn2", "d2")); // PENDING + state.addCallResult( + new CallResult( + "fn3", "d3", null, "error".getBytes(StandardCharsets.UTF_8))); // FAILED + + byte[] payload = objectMapper.writeValueAsBytes(state); + ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getCallResults()).hasSize(3); + assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); + assertThat(retrieved.getCallResult(1).isPending()).isTrue(); + assertThat(retrieved.getCallResult(2).isFailure()).isTrue(); + } + + @Test + void testCompletedAction() throws Exception { + InputEvent taskEvent = new InputEvent("test-data"); + ActionState state = new ActionState(taskEvent); + state.addCallResult(new CallResult("fn1", "d1", "result".getBytes(StandardCharsets.UTF_8))); + state.markCompleted(); + + byte[] payload = objectMapper.writeValueAsBytes(state); + ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.isCompleted()).isTrue(); + assertThat(retrieved.getCallResults()).isEmpty(); + } + + @Test + void testFullSerializationRoundTrip() throws Exception { + InputEvent taskEvent = new InputEvent("full-test-data"); + ActionState state = new ActionState(taskEvent); + state.addEvent(new InputEvent("output-event-1")); + state.addEvent(new InputEvent("output-event-2")); + state.addCallResult( + new CallResult("fn1", "digest1", "payload1".getBytes(StandardCharsets.UTF_8))); + state.addCallResult(CallResult.pending("fn2", "digest2")); + + byte[] payload = objectMapper.writeValueAsBytes(state); + ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + + assertThat(retrieved).isNotNull(); + 
assertThat(retrieved.getTaskEvent()).isEqualTo(taskEvent); + assertThat(retrieved.getOutputEvents()).hasSize(2); + assertThat(retrieved.getCallResults()).hasSize(2); + assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); + assertThat(retrieved.getCallResult(1).isPending()).isTrue(); + assertThat(retrieved.isCompleted()).isFalse(); + } +} From 66a9ada44e63978c5b4e90e4c5f820a423e78c1c Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Tue, 28 Apr 2026 16:30:24 +0800 Subject: [PATCH 4/9] refactor(runtime): Remove the old Kafka serialization class and use ActionStateSerialization uniformly --- .../api/configuration/AgentConfigOptions.java | 2 +- docs/content/docs/operations/configuration.md | 2 +- .../ActionStateKafkaDeserializer.java | 110 ------ .../actionstate/ActionStateKafkaSeder.java | 85 +--- ...aSerializer.java => ActionStateSerde.java} | 51 +-- .../actionstate/FlussActionStateStore.java | 369 +++++++++++------- .../actionstate/KafkaActionStateStore.java | 5 +- .../actionstate/ActionStateSerdeTest.java | 77 ++-- .../actionstate/ActionStateUtilTest.java | 20 - .../actionstate/FlussActionStateStoreIT.java | 86 ++-- .../FlussActionStateStoreTest.java | 205 ++++++---- .../KafkaActionStateStoreTest.java | 19 - .../runtime/actionstate/TestAction.java | 44 +++ 13 files changed, 515 insertions(+), 560 deletions(-) delete mode 100644 runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaDeserializer.java rename runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/{ActionStateKafkaSerializer.java => ActionStateSerde.java} (67%) create mode 100644 runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java diff --git a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java index 45342121f..441b414e1 100644 --- a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java +++ b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java @@ -61,7 +61,7 @@ public class AgentConfigOptions { /** The config parameter specifies the Fluss table name for action state. */ public static final ConfigOption FLUSS_ACTION_STATE_TABLE = - new ConfigOption<>("flussActionStateTable", String.class, "action_state"); + new ConfigOption<>("flussActionStateTable", String.class, "action_states"); /** The config parameter specifies the number of buckets for the Fluss action state table. */ public static final ConfigOption FLUSS_ACTION_STATE_TABLE_BUCKETS = diff --git a/docs/content/docs/operations/configuration.md b/docs/content/docs/operations/configuration.md index 55b54e756..757eb2649 100644 --- a/docs/content/docs/operations/configuration.md +++ b/docs/content/docs/operations/configuration.md @@ -165,7 +165,7 @@ Here are the configuration options for Fluss-based Action State Store. |------------------------------|------------------|---------|------------------------------------------------------------------------------------------| | `flussBootstrapServers` | "localhost:9123" | String | The Fluss bootstrap servers address. | | `flussActionStateDatabase` | "flink_agents" | String | The Fluss database name for storing action state. | -| `flussActionStateTable` | "action_state" | String | The Fluss table name for storing action state. | +| `flussActionStateTable` | "action_states" | String | The Fluss table name for storing action state. 
| | `flussActionStateTableBuckets` | 64 | Integer | The number of buckets for the Fluss action state table. | | `flussSecurityProtocol` | "PLAINTEXT" | String | The authentication protocol for Fluss client (e.g., `PLAINTEXT`, `SASL_PLAIN`). | | `flussSaslMechanism` | "PLAIN" | String | The SASL mechanism for Fluss authentication. | diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaDeserializer.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaDeserializer.java deleted file mode 100644 index 38e7b6ad7..000000000 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaDeserializer.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.flink.agents.runtime.actionstate; - -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.JsonDeserializer; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.flink.agents.api.Event; -import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.OutputEvent; -import org.apache.flink.agents.runtime.operator.ActionTask; -import org.apache.kafka.common.serialization.Deserializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.util.Map; - -/** - * Kafka deserializer for {@link ActionState}. - * - *

This deserializer handles the deserialization of byte arrays from Kafka back to ActionState - * instances. It uses Jackson ObjectMapper with custom deserializers to handle polymorphic Event - * types and ensures ActionTask is deserialized as null. - */ -public class ActionStateKafkaDeserializer implements Deserializer { - - private static final Logger LOG = LoggerFactory.getLogger(ActionStateKafkaDeserializer.class); - private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); - - @Override - public void configure(Map configs, boolean isKey) { - // No configuration needed - } - - @Override - public ActionState deserialize(String topic, byte[] data) { - if (data == null) { - return null; - } - - try { - return OBJECT_MAPPER.readValue(data, ActionState.class); - } catch (Exception e) { - LOG.error("Failed to deserialize ActionState for topic: {}", topic, e); - throw new RuntimeException("Failed to deserialize ActionState", e); - } - } - - @Override - public void close() { - // No resources to close - } - - /** Creates and configures the ObjectMapper for ActionState deserialization. */ - private static ObjectMapper createObjectMapper() { - ObjectMapper mapper = new ObjectMapper(); - - // Add type information for polymorphic Event deserialization - mapper.addMixIn(Event.class, EventTypeInfoMixin.class); - mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); - mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); - - // Create a module for custom deserializers - SimpleModule module = new SimpleModule(); - - // Custom deserializer for ActionTask - always deserialize as null - module.addDeserializer(ActionTask.class, new ActionTaskDeserializer()); - - mapper.registerModule(module); - - return mapper; - } - - /** Mixin to add type information for Event hierarchy. */ - @JsonTypeInfo( - use = JsonTypeInfo.Id.CLASS, - include = JsonTypeInfo.As.PROPERTY, - property = "@class") - public abstract static class EventTypeInfoMixin {} - - /** Custom deserializer for ActionTask that always deserializes as null. 
*/ - public static class ActionTaskDeserializer extends JsonDeserializer { - @Override - public ActionTask deserialize(JsonParser p, DeserializationContext ctxt) - throws IOException { - // Skip the value and return null - p.skipChildren(); - return null; - } - } -} diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSeder.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSeder.java index b225eb09c..9af3dc3f6 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSeder.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSeder.java @@ -17,107 +17,34 @@ */ package org.apache.flink.agents.runtime.actionstate; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.databind.JsonSerializer; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializerProvider; -import com.fasterxml.jackson.databind.module.SimpleModule; -import org.apache.flink.agents.api.Event; -import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.OutputEvent; -import org.apache.flink.agents.runtime.operator.ActionTask; import org.apache.kafka.common.serialization.Deserializer; import org.apache.kafka.common.serialization.Serializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import java.io.IOException; import java.util.Map; /** - * Kafka serializer for {@link ActionState}. - * - *

This serializer handles the serialization of ActionState instances to byte arrays for storage - * in Kafka. It uses Jackson ObjectMapper with custom serializers to handle polymorphic Event types - * and ensures ActionTask is serialized as null. + * Kafka serializer/deserializer for {@link ActionState}. Delegates to {@link ActionStateSerde} for + * the actual serialization logic. */ public class ActionStateKafkaSeder implements Serializer, Deserializer { - private static final Logger LOG = LoggerFactory.getLogger(ActionStateKafkaSeder.class); - private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); - @Override public void configure(Map configs, boolean isKey) { // No configuration needed } @Override - public ActionState deserialize(String topic, byte[] data) { - if (data == null) { - return null; - } - - try { - return OBJECT_MAPPER.readValue(data, ActionState.class); - } catch (Exception e) { - LOG.error("Failed to deserialize ActionState for topic: {}", topic, e); - throw new RuntimeException("Failed to deserialize ActionState", e); - } + public byte[] serialize(String topic, ActionState data) { + return data == null ? null : ActionStateSerde.serialize(data); } @Override - public byte[] serialize(String topic, ActionState data) { - if (data == null) { - return null; - } - - try { - return OBJECT_MAPPER.writeValueAsBytes(data); - } catch (Exception e) { - LOG.error("Failed to serialize ActionState for topic: {}", topic, e); - throw new RuntimeException("Failed to serialize ActionState", e); - } + public ActionState deserialize(String topic, byte[] data) { + return data == null ? null : ActionStateSerde.deserialize(data); } @Override public void close() { // No resources to close } - - /** Creates and configures the ObjectMapper for ActionState serialization. */ - private static ObjectMapper createObjectMapper() { - ObjectMapper mapper = new ObjectMapper(); - - // Add type information for polymorphic Event deserialization - mapper.addMixIn(Event.class, EventTypeInfoMixin.class); - mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); - mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); - - // Create a module for custom serializers - SimpleModule module = new SimpleModule(); - - // Custom serializer for ActionTask - always serialize as null - module.addSerializer(ActionTask.class, new ActionTaskSerializer()); - - mapper.registerModule(module); - - return mapper; - } - - /** Mixin to add type information for Event hierarchy. */ - @JsonTypeInfo( - use = JsonTypeInfo.Id.CLASS, - include = JsonTypeInfo.As.PROPERTY, - property = "@class") - public abstract static class EventTypeInfoMixin {} - - /** Custom serializer for ActionTask that always serializes as null. 
*/ - public static class ActionTaskSerializer extends JsonSerializer { - @Override - public void serialize(ActionTask value, JsonGenerator gen, SerializerProvider serializers) - throws IOException { - gen.writeNull(); - } - } } diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSerializer.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerde.java similarity index 67% rename from runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSerializer.java rename to runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerde.java index 881c5293c..02c9a06b3 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateKafkaSerializer.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerde.java @@ -27,50 +27,40 @@ import org.apache.flink.agents.api.InputEvent; import org.apache.flink.agents.api.OutputEvent; import org.apache.flink.agents.runtime.operator.ActionTask; -import org.apache.kafka.common.serialization.Serializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.Map; /** - * Kafka serializer for {@link ActionState}. + * Backend-agnostic serializer/deserializer for {@link ActionState}. * - *

This serializer handles the serialization of ActionState instances to byte arrays for storage - * in Kafka. It uses Jackson ObjectMapper with custom serializers to handle polymorphic Event types - * and ensures ActionTask is serialized as null. + *

Uses Jackson {@link ObjectMapper} configured with polymorphic type information for the {@link + * Event} hierarchy and a custom null-serializer for {@link ActionTask}. Both Kafka and Fluss + * ActionStateStore backends delegate to this class for consistent serialization format. */ -public class ActionStateKafkaSerializer implements Serializer { +public final class ActionStateSerde { - private static final Logger LOG = LoggerFactory.getLogger(ActionStateKafkaSerializer.class); private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); - @Override - public void configure(Map configs, boolean isKey) { - // No configuration needed - } - - @Override - public byte[] serialize(String topic, ActionState data) { - if (data == null) { - return null; - } + private ActionStateSerde() {} + /** Serializes an {@link ActionState} to a JSON byte array. */ + public static byte[] serialize(ActionState state) { try { - return OBJECT_MAPPER.writeValueAsBytes(data); + return OBJECT_MAPPER.writeValueAsBytes(state); } catch (Exception e) { - LOG.error("Failed to serialize ActionState for topic: {}", topic, e); throw new RuntimeException("Failed to serialize ActionState", e); } } - @Override - public void close() { - // No resources to close + /** Deserializes an {@link ActionState} from a JSON byte array. */ + public static ActionState deserialize(byte[] data) { + try { + return OBJECT_MAPPER.readValue(data, ActionState.class); + } catch (Exception e) { + throw new RuntimeException("Failed to deserialize ActionState", e); + } } - /** Creates and configures the ObjectMapper for ActionState serialization. */ private static ObjectMapper createObjectMapper() { ObjectMapper mapper = new ObjectMapper(); @@ -79,12 +69,9 @@ private static ObjectMapper createObjectMapper() { mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); - // Create a module for custom serializers - SimpleModule module = new SimpleModule(); - // Custom serializer for ActionTask - always serialize as null + SimpleModule module = new SimpleModule(); module.addSerializer(ActionTask.class, new ActionTaskSerializer()); - mapper.registerModule(module); return mapper; @@ -95,10 +82,10 @@ private static ObjectMapper createObjectMapper() { use = JsonTypeInfo.Id.CLASS, include = JsonTypeInfo.As.PROPERTY, property = "@class") - public abstract static class EventTypeInfoMixin {} + abstract static class EventTypeInfoMixin {} /** Custom serializer for ActionTask that always serializes as null. 
*/ - public static class ActionTaskSerializer extends JsonSerializer { + static class ActionTaskSerializer extends JsonSerializer { @Override public void serialize(ActionTask value, JsonGenerator gen, SerializerProvider serializers) throws IOException { diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java index 39eeba914..d31900f06 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -17,13 +17,10 @@ */ package org.apache.flink.agents.runtime.actionstate; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.flink.agents.api.Event; -import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.OutputEvent; import org.apache.flink.agents.plan.AgentConfiguration; import org.apache.flink.agents.plan.actions.Action; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.fluss.client.Connection; import org.apache.fluss.client.ConnectionFactory; import org.apache.fluss.client.admin.Admin; @@ -33,6 +30,7 @@ import org.apache.fluss.client.table.scanner.log.LogScanner; import org.apache.fluss.client.table.scanner.log.ScanRecords; import org.apache.fluss.client.table.writer.AppendWriter; +import org.apache.fluss.config.ConfigOptions; import org.apache.fluss.config.Configuration; import org.apache.fluss.metadata.DatabaseDescriptor; import org.apache.fluss.metadata.Schema; @@ -62,64 +60,48 @@ import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_USERNAME; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SECURITY_PROTOCOL; import static org.apache.flink.agents.runtime.actionstate.ActionStateUtil.generateKey; +import static org.apache.fluss.config.ConfigOptions.BOOTSTRAP_SERVERS; +import static org.apache.fluss.config.ConfigOptions.CLIENT_SASL_JAAS_CONFIG; +import static org.apache.fluss.config.ConfigOptions.CLIENT_SASL_JAAS_PASSWORD; +import static org.apache.fluss.config.ConfigOptions.CLIENT_SASL_JAAS_USERNAME; +import static org.apache.fluss.config.ConfigOptions.CLIENT_SASL_MECHANISM; +import static org.apache.fluss.config.ConfigOptions.CLIENT_SECURITY_PROTOCOL; /** * An implementation of {@link ActionStateStore} that uses an Apache Fluss log table as the backend. - * All state is maintained in an in-memory HashMap for O(1) lookup, with the Fluss log table - * providing durability and recovery support. - * - *

Compared to the previous KV-table implementation: + * All state is maintained in an in-memory map for fast lookups, with the Fluss log table providing + * durability and recovery support. * *

- *   • Read: O(1) in-memory HashMap lookup; no network I/O per query.
- *   • Write: Append to log + update in-memory HashMap.
- *   • Recovery: Scan log from recovery markers (or from beginning if none) to rebuild
- *     in-memory state.
+ *   • Write: Append to Fluss log + update in-memory cache (synchronous).
+ *   • Read: In-memory lookup with divergence detection.
+ *   • Recovery: Scan log from checkpoint offsets (recovery markers) to the latest offset.
 *   • Prune: In-memory eviction only; physical cleanup relies on Fluss log retention.
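+ *   (divergence = more than one cached record sharing the same key and seqNum prefix;
+ *    get() then drops cached entries with a newer seqNum before serving the lookup)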
 *
- * The Fluss log table schema (no primary key — this creates a log/append-only table):
+ * The Fluss log table schema:
 *
 *
  * state_key     STRING  -- Generated by ActionStateUtil.generateKey()
  * state_payload BYTES   -- ActionState JSON serialized bytes
- * agent_key     STRING  -- Original key; used as bucket key so all records for the same
- *                          agent are co-located in the same bucket
+ * agent_key     STRING  -- Original input key; used as bucket key so all records for the same
+ *                          input key are co-located in the same bucket
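+ *                          (illustrative: every record for input key "user-17" hashes to
+ *                           a single bucket, so one scanner subscription replays its history)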
  * 
*/ public class FlussActionStateStore implements ActionStateStore { private static final Logger LOG = LoggerFactory.getLogger(FlussActionStateStore.class); - private static final ObjectMapper OBJECT_MAPPER = createObjectMapper(); - - /** - * Number of consecutive empty polls before considering the log fully consumed during recovery. - */ - private static final int EMPTY_POLL_THRESHOLD = 3; private static final Duration POLL_TIMEOUT = Duration.ofSeconds(1); - /** Creates and configures the ObjectMapper for ActionState serialization. */ - private static ObjectMapper createObjectMapper() { - ObjectMapper mapper = new ObjectMapper(); - mapper.addMixIn(Event.class, EventTypeInfoMixin.class); - mapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); - mapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); - return mapper; - } - - /** Mixin to add type information for Event hierarchy. */ - @JsonTypeInfo( - use = JsonTypeInfo.Id.CLASS, - include = JsonTypeInfo.As.PROPERTY, - property = "@class") - abstract static class EventTypeInfoMixin {} + // Column names in the Fluss table schema + private static final String COL_NAME_STATE_KEY = "state_key"; + private static final String COL_NAME_STATE_PAYLOAD = "state_payload"; + private static final String COL_NAME_AGENT_KEY = "agent_key"; // Column indices in the Fluss table schema private static final int COL_STATE_KEY = 0; private static final int COL_STATE_PAYLOAD = 1; - private static final int COL_AGENT_KEY = 2; - private static final int NUM_COLUMNS = 3; private final AgentConfiguration agentConfiguration; private final String databaseName; @@ -133,6 +115,22 @@ abstract static class EventTypeInfoMixin {} /** In-memory cache for O(1) state lookups; rebuilt from Fluss log on recovery. */ private final Map actionStates; + @VisibleForTesting + FlussActionStateStore( + Map actionStates, + Connection connection, + Table table, + AppendWriter writer) { + this.agentConfiguration = null; + this.databaseName = null; + this.tableName = null; + this.tablePath = null; + this.actionStates = actionStates; + this.connection = connection; + this.table = table; + this.writer = writer; + } + public FlussActionStateStore(AgentConfiguration agentConfiguration) { this.agentConfiguration = agentConfiguration; this.databaseName = agentConfiguration.get(FLUSS_ACTION_STATE_DATABASE); @@ -141,27 +139,28 @@ public FlussActionStateStore(AgentConfiguration agentConfiguration) { this.actionStates = new HashMap<>(); Configuration flussConf = new Configuration(); - flussConf.setString("bootstrap.servers", agentConfiguration.get(FLUSS_BOOTSTRAP_SERVERS)); + flussConf.setString( + BOOTSTRAP_SERVERS.key(), agentConfiguration.get(FLUSS_BOOTSTRAP_SERVERS)); + // Minimize latency for synchronous put(): setting batch linger time to zero ensures + // that each append is sent immediately without waiting for additional records to batch. 
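+        // (Editorial note: put() already blocks on each append for durability, so
+        // there is little batching opportunity to lose; linger-zero only cuts latency.)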
+ flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_TIMEOUT, Duration.ZERO); + + flussConf.setString( + CLIENT_SECURITY_PROTOCOL, agentConfiguration.get(FLUSS_SECURITY_PROTOCOL)); + + flussConf.setString(CLIENT_SASL_MECHANISM, agentConfiguration.get(FLUSS_SASL_MECHANISM)); - String securityProtocol = agentConfiguration.get(FLUSS_SECURITY_PROTOCOL); - if (securityProtocol != null) { - flussConf.setString("client.security.protocol", securityProtocol); - } - String saslMechanism = agentConfiguration.get(FLUSS_SASL_MECHANISM); - if (saslMechanism != null) { - flussConf.setString("client.security.sasl.mechanism", saslMechanism); - } String jaasConfig = agentConfiguration.get(FLUSS_SASL_JAAS_CONFIG); if (jaasConfig != null) { - flussConf.setString("client.security.sasl.jaas.config", jaasConfig); + flussConf.setString(CLIENT_SASL_JAAS_CONFIG, jaasConfig); } String username = agentConfiguration.get(FLUSS_SASL_USERNAME); if (username != null) { - flussConf.setString("client.security.sasl.username", username); + flussConf.setString(CLIENT_SASL_JAAS_USERNAME, username); } String password = agentConfiguration.get(FLUSS_SASL_PASSWORD); if (password != null) { - flussConf.setString("client.security.sasl.password", password); + flussConf.setString(CLIENT_SASL_JAAS_PASSWORD, password); } this.connection = ConnectionFactory.createConnection(flussConf); @@ -179,12 +178,13 @@ public FlussActionStateStore(AgentConfiguration agentConfiguration) { public void put(Object key, long seqNum, Action action, Event event, ActionState state) throws Exception { String stateKey = generateKey(key, seqNum, action, event); - byte[] payload = OBJECT_MAPPER.writeValueAsBytes(state); + byte[] payload = ActionStateSerde.serialize(state); - GenericRow row = new GenericRow(NUM_COLUMNS); - row.setField(COL_STATE_KEY, BinaryString.fromString(stateKey)); - row.setField(COL_STATE_PAYLOAD, payload); - row.setField(COL_AGENT_KEY, BinaryString.fromString(key.toString())); + GenericRow row = + GenericRow.of( + BinaryString.fromString(stateKey), + payload, + BinaryString.fromString(key.toString())); // Append to Fluss log for durability, then update in-memory cache. // Synchronous write ensures the record is durable before returning. @@ -197,39 +197,69 @@ public void put(Object key, long seqNum, Action action, Event event, ActionState @Override public ActionState get(Object key, long seqNum, Action action, Event event) throws Exception { String stateKey = generateKey(key, seqNum, action, event); + + boolean hasDivergence = checkDivergence(key.toString(), seqNum); + + if (!actionStates.containsKey(stateKey) || hasDivergence) { + actionStates + .entrySet() + .removeIf( + entry -> { + try { + List parts = ActionStateUtil.parseKey(entry.getKey()); + if (parts.size() >= 2) { + long stateSeqNum = Long.parseLong(parts.get(1)); + return stateSeqNum > seqNum; + } + } catch (NumberFormatException e) { + LOG.warn( + "Failed to parse sequence number from state key: {}", + stateKey); + } + return false; + }); + } + ActionState state = actionStates.get(stateKey); LOG.debug("Lookup action state: key={}, found={}", stateKey, state != null); return state; } + private boolean checkDivergence(String key, long seqNum) { + return actionStates.keySet().stream() + .filter(k -> k.startsWith(key + "_" + seqNum + "_")) + .count() + > 1; + } + /** * Rebuilds in-memory state by scanning the Fluss log table. If recovery markers are provided, * computes the minimum offset per bucket across all markers and subscribes from those offsets. 
- * Otherwise, subscribes from the beginning. Reads until the log is exhausted ({@link - * #EMPTY_POLL_THRESHOLD} consecutive empty polls). For the same state key appearing multiple - * times in the log, the latest record wins (last-write-wins). - * - *

Note: Unlike the Kafka implementation which skips rebuild on empty markers, this - * implementation scans from the beginning when no markers are provided, enabling full state - * recovery even without prior checkpoints. + * Otherwise, skips rebuild since there is no checkpointed position to recover from. Reads from + * the start offset up to the latest offset captured at rebuild start. For the same state key + * appearing multiple times in the log, the latest record wins (last-write-wins). */ @Override public void rebuildState(List recoveryMarkers) { LOG.info( "Rebuilding action state from Fluss log table with {} recovery markers", recoveryMarkers.size()); - actionStates.clear(); - int numBuckets = table.getTableInfo().getNumBuckets(); + if (recoveryMarkers.isEmpty()) { + LOG.info("No recovery markers, skipping state rebuild"); + return; + } + + actionStates.clear(); // Compute per-bucket start offsets from recovery markers - Map bucketOffsets = new HashMap<>(); + Map bucketStartOffsets = new HashMap<>(); for (Object marker : recoveryMarkers) { if (marker instanceof Map) { @SuppressWarnings("unchecked") Map markerMap = (Map) marker; for (Map.Entry entry : markerMap.entrySet()) { - bucketOffsets.merge(entry.getKey(), entry.getValue(), Math::min); + bucketStartOffsets.merge(entry.getKey(), entry.getValue(), Math::min); } } else if (marker != null) { LOG.warn( @@ -238,40 +268,116 @@ public void rebuildState(List recoveryMarkers) { } } + if (bucketStartOffsets.isEmpty()) { + LOG.info("No valid bucket offsets in recovery markers, skipping state rebuild"); + return; + } + + // Capture the latest offsets as the stopping point for each bucket + Map bucketEndOffsets = getBucketEndOffsets(); + // Capture the earliest available offsets so we can skip buckets whose data has + // been fully cleaned by log retention (earliestOffset >= endOffset). 
+ Map bucketEarliestOffsets = getBucketEarliestOffsets(); + LOG.debug( + "Rebuild window: startOffsets={}, earliestOffsets={}, endOffsets={}", + bucketStartOffsets, + bucketEarliestOffsets, + bucketEndOffsets); + try (LogScanner scanner = table.newScan().createLogScanner()) { - if (bucketOffsets.isEmpty()) { - for (int b = 0; b < numBuckets; b++) { - scanner.subscribeFromBeginning(b); + // Track which buckets still need to be consumed + Map remainingBuckets = new HashMap<>(); + for (Map.Entry entry : bucketStartOffsets.entrySet()) { + int bucket = entry.getKey(); + long startOffset = entry.getValue(); + Long endOffset = bucketEndOffsets.get(bucket); + + // Bucket referenced in recovery marker does not exist in current table + if (endOffset == null) { + LOG.warn( + "Bucket {} referenced in recovery marker does not exist " + + "in current table, state recovery for this bucket is skipped", + bucket); + continue; } - } else { - for (Map.Entry entry : bucketOffsets.entrySet()) { - scanner.subscribe(entry.getKey(), entry.getValue()); + + // No new data since checkpoint (includes empty buckets that never had writes) + if (endOffset <= startOffset) { + LOG.info( + "Skipping bucket {} for rebuild: no new data " + + "(endOffset={} <= startOffset={})", + bucket, + endOffset, + startOffset); + continue; + } + + // Check if retention has cleaned data in the recovery window + Long earliestOffset = bucketEarliestOffsets.get(bucket); + long effectiveStart = startOffset; + if (earliestOffset != null && earliestOffset > startOffset) { + effectiveStart = earliestOffset; + if (effectiveStart >= endOffset) { + // All data in recovery window has been cleaned by retention + LOG.warn( + "Bucket {} state recovery failed: all data between offset {} " + + "and {} has been cleaned by log retention " + + "(earliest available: {})", + bucket, + startOffset, + endOffset, + earliestOffset); + continue; + } + // Partial data loss: some state between startOffset and earliestOffset is gone + LOG.warn( + "Bucket {} partial state loss: data between offset {} and {} " + + "has been cleaned by log retention, " + + "recovering from offset {} instead", + bucket, + startOffset, + earliestOffset, + effectiveStart); } + + scanner.subscribe(bucket, effectiveStart); + remainingBuckets.put(bucket, endOffset); } + LOG.debug("Subscribed buckets for rebuild: {}", remainingBuckets); + + while (!remainingBuckets.isEmpty()) { - int emptyPollCount = 0; - while (emptyPollCount < EMPTY_POLL_THRESHOLD) { ScanRecords records = scanner.poll(POLL_TIMEOUT); - if (records.isEmpty()) { - emptyPollCount++; - } else { - emptyPollCount = 0; - for (TableBucket bucket : records.buckets()) { - for (ScanRecord record : records.records(bucket)) { - InternalRow row = record.getRow(); - String stateKey = row.getString(COL_STATE_KEY).toString(); - byte[] payload = row.getBytes(COL_STATE_PAYLOAD); - try { - ActionState state = - OBJECT_MAPPER.readValue(payload, ActionState.class); - actionStates.put(stateKey, state); - } catch (Exception e) { - LOG.warn( - "Failed to deserialize action state for key: {}, skipping", - stateKey, - e); - } + for (TableBucket bucket : records.buckets()) { + Long endOffset = remainingBuckets.get(bucket.getBucket()); + if (endOffset == null) { + continue; + } + // Track the highest offset seen in this batch (including skipped records + // beyond endOffset) so that we can detect when the bucket has been fully + // consumed even if the last records are past our target window. 
+ long lastSeenOffset = -1; + for (ScanRecord record : records.records(bucket)) { + lastSeenOffset = record.logOffset(); + if (record.logOffset() >= endOffset) { + continue; } + InternalRow row = record.getRow(); + String stateKey = row.getString(COL_STATE_KEY).toString(); + byte[] payload = row.getBytes(COL_STATE_PAYLOAD); + try { + ActionState state = ActionStateSerde.deserialize(payload); + actionStates.put(stateKey, state); + } catch (Exception e) { + LOG.warn( + "Failed to deserialize action state for key: {}, skipping", + stateKey, + e); + } + } + // Remove bucket if the highest seen offset has reached or passed the end + if (lastSeenOffset + 1 >= endOffset) { + remainingBuckets.remove(bucket.getBucket()); } } } @@ -282,28 +388,41 @@ public void rebuildState(List recoveryMarkers) { LOG.info("Completed rebuilding state, recovered {} states", actionStates.size()); } - /** - * Returns the end offsets of each bucket as a recovery marker. Similar to Kafka's - * implementation, this captures the current log position so that {@link #rebuildState} can - * resume from these offsets instead of scanning from the beginning. - */ - @Override - public Object getRecoveryMarker() { - int numBuckets = table.getTableInfo().getNumBuckets(); + @SuppressWarnings("unchecked") + private Map getBucketEndOffsets() { + return getBucketOffsets(new OffsetSpec.LatestSpec()); + } + + @SuppressWarnings("unchecked") + private Map getBucketEarliestOffsets() { + return getBucketOffsets(new OffsetSpec.EarliestSpec()); + } + @SuppressWarnings("unchecked") + private Map getBucketOffsets(OffsetSpec offsetSpec) { + int numBuckets = table.getTableInfo().getNumBuckets(); try (Admin admin = connection.getAdmin()) { List buckets = new ArrayList<>(); for (int b = 0; b < numBuckets; b++) { buckets.add(b); } - return new HashMap<>( - admin.listOffsets(tablePath, buckets, new OffsetSpec.LatestSpec()).all().get()); + return (Map) + admin.listOffsets(tablePath, buckets, offsetSpec).all().get(); } catch (Exception e) { - LOG.error("Failed to get end offsets for Fluss table: {}", tablePath, e); - throw new RuntimeException("Failed to get end offsets for Fluss table", e); + throw new RuntimeException("Failed to get offsets for Fluss table: " + tablePath, e); } } + /** + * Returns the end offsets of each bucket as a recovery marker. Similar to Kafka's + * implementation, this captures the current log position so that {@link #rebuildState} can + * resume from these offsets instead of scanning from the beginning. + */ + @Override + public Object getRecoveryMarker() { + return getBucketEndOffsets(); + } + /** * Evicts pruned states from the in-memory cache. The Fluss log is append-only; physical cleanup * relies on Fluss log retention configuration. 
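Since generated state keys embed the originating key and sequence number (the ActionStateUtil tests later in this patch round-trip key, seqNum, event UUID, and action UUID in that order), the eviction in pruneState boils down to parse-and-compare. A condensed sketch, assuming parseKey returns those four components:

    // Hypothetical condensed form of the in-memory eviction; logging trimmed.
    actionStates.keySet().removeIf(stateKey -> {
        try {
            List<String> parts = ActionStateUtil.parseKey(stateKey);
            return parts.get(0).equals(key.toString())         // same agent key
                    && Long.parseLong(parts.get(1)) <= seqNum; // pruned up to seqNum
        } catch (Exception e) {
            return false; // keep keys we cannot parse
        }
    });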
@@ -324,50 +443,24 @@ public void pruneState(Object key, long seqNum) { long stateSeqNum = Long.parseLong(parts.get(1)); return stateSeqNum <= seqNum; } - } catch (NumberFormatException e) { - LOG.warn("Failed to parse seqNum from state key: {}", stateKey); + } catch (Exception e) { + LOG.warn("Failed to parse state key: {}", stateKey, e); } } return false; }); - - LOG.debug("Pruned in-memory state for key: {} up to seqNum: {}", key, seqNum); } @Override public void close() throws Exception { - Exception firstException = null; - - try { - if (writer != null) { - writer.flush(); - } - } catch (Exception e) { - firstException = e; - } - try { if (table != null) { table.close(); } - } catch (Exception e) { - if (firstException == null) { - firstException = e; - } - } - - try { + } finally { if (connection != null) { connection.close(); } - } catch (Exception e) { - if (firstException == null) { - firstException = e; - } - } - - if (firstException != null) { - throw firstException; } } @@ -382,16 +475,16 @@ private void maybeCreateDatabaseAndTable() { // No primaryKey() call — this creates an append-only log table in Fluss. Schema schema = Schema.newBuilder() - .column("state_key", DataTypes.STRING()) - .column("state_payload", DataTypes.BYTES()) - .column("agent_key", DataTypes.STRING()) + .column(COL_NAME_STATE_KEY, DataTypes.STRING()) + .column(COL_NAME_STATE_PAYLOAD, DataTypes.BYTES()) + .column(COL_NAME_AGENT_KEY, DataTypes.STRING()) .build(); int buckets = agentConfiguration.get(FLUSS_ACTION_STATE_TABLE_BUCKETS); TableDescriptor descriptor = TableDescriptor.builder() .schema(schema) - .distributedBy(buckets, "agent_key") + .distributedBy(buckets, COL_NAME_AGENT_KEY) .comment("Flink Agents action state log") .build(); diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStore.java index 648f146be..f09e5cd29 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStore.java @@ -194,7 +194,9 @@ public ActionState get(Object key, long seqNum, Action action, Event event) thro } private boolean checkDivergence(String key, long seqNum) { - return actionStates.keySet().stream().filter(k -> k.startsWith(key + "_" + seqNum)).count() + return actionStates.keySet().stream() + .filter(k -> k.startsWith(key + "_" + seqNum + "_")) + .count() > 1; } @@ -211,6 +213,7 @@ public void rebuildState(List recoveryMarkers) { // Process recovery markers to get the smallest offsets for each partition for (Object marker : recoveryMarkers) { if (marker instanceof Map) { + @SuppressWarnings("unchecked") Map markerMap = (Map) marker; for (Map.Entry entry : markerMap.entrySet()) { Long offset = diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerdeTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerdeTest.java index ca5103321..e0c4de0bf 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerdeTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateSerdeTest.java @@ -53,16 +53,13 @@ public void testActionStateSerializationDeserialization() throws Exception { originalState.addShortTermMemoryUpdate(shortTermMemoryUpdate); originalState.addEvent(outputEvent); - // Test Kafka seder/deserializer - 
ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - // Serialize - byte[] serialized = seder.serialize("test-topic", originalState); + byte[] serialized = ActionStateSerde.serialize(originalState); assertNotNull(serialized); assertTrue(serialized.length > 0); // Deserialize - ActionState deserializedState = seder.deserialize("test-topic", serialized); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); assertNotNull(deserializedState); // Verify taskEvent @@ -102,10 +99,8 @@ public void testActionStateWithNullTaskEvent() throws Exception { originalState.addSensoryMemoryUpdate(memoryUpdate); // Test serialization/deserialization - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); // Verify taskEvent is null assertNull(deserializedState.getTaskEvent()); @@ -128,10 +123,8 @@ public void testActionStateWithComplexAttributes() throws Exception { ActionState originalState = new ActionState(inputEvent); // Test serialization/deserialization - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); // Verify complex attributes are preserved InputEvent deserializedInputEvent = (InputEvent) deserializedState.getTaskEvent(); @@ -157,10 +150,8 @@ public void testActionStateWithCallResults() throws Exception { originalState.addCallResult(result2); // Test serialization/deserialization - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); // Verify call results assertEquals(2, deserializedState.getCallResultCount()); @@ -186,10 +177,8 @@ public void testActionStateWithPendingCallResult() throws Exception { ActionState originalState = new ActionState(inputEvent); originalState.addCallResult(CallResult.pending("module.func", "digest")); - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); assertEquals(1, deserializedState.getCallResultCount()); CallResult result = deserializedState.getCallResult(0); @@ -215,10 +204,8 @@ public void testActionStateWithCompletedFlag() throws Exception { inputEvent, sensoryUpdates, shortTermUpdates, outputEvents, null, true); // Test serialization/deserialization - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = 
ActionStateSerde.deserialize(serialized); // Verify completed flag assertTrue(deserializedState.isCompleted()); @@ -242,10 +229,8 @@ public void testActionStateInProgressWithCallResults() throws Exception { new ActionState(inputEvent, null, null, null, callResults, false); // Test serialization/deserialization - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); // Verify state assertFalse(deserializedState.isCompleted()); @@ -261,10 +246,8 @@ public void testCallResultWithNullPayloads() throws Exception { ActionState originalState = new ActionState(inputEvent); originalState.addCallResult(new CallResult("func", "digest", null, null)); - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); - - byte[] serialized = seder.serialize("test-topic", originalState); - ActionState deserializedState = seder.deserialize("test-topic", serialized); + byte[] serialized = ActionStateSerde.serialize(originalState); + ActionState deserializedState = ActionStateSerde.deserialize(serialized); assertEquals(1, deserializedState.getCallResultCount()); CallResult result = deserializedState.getCallResult(0); @@ -310,9 +293,8 @@ public void testDeserializeLegacyCallResultWithoutStatus() throws Exception { + "\"completed\":false" + "}"; - ActionStateKafkaSeder seder = new ActionStateKafkaSeder(); ActionState deserializedState = - seder.deserialize("test-topic", json.getBytes(StandardCharsets.UTF_8)); + ActionStateSerde.deserialize(json.getBytes(StandardCharsets.UTF_8)); assertEquals(2, deserializedState.getCallResultCount()); @@ -326,4 +308,29 @@ public void testDeserializeLegacyCallResultWithoutStatus() throws Exception { assertArrayEquals( "exception".getBytes(StandardCharsets.UTF_8), legacyFailure.getExceptionPayload()); } + + @Test + public void testKafkaSederDelegatesToActionStateSerde() throws Exception { + InputEvent inputEvent = new InputEvent("test delegation"); + ActionState originalState = new ActionState(inputEvent); + + // Serialize via ActionStateSerde, deserialize via Kafka seder (and vice versa) + ActionStateKafkaSeder kafkaSeder = new ActionStateKafkaSeder(); + + byte[] serializedBySerde = ActionStateSerde.serialize(originalState); + byte[] serializedByKafka = kafkaSeder.serialize("test-topic", originalState); + + ActionState fromSerde = ActionStateSerde.deserialize(serializedByKafka); + ActionState fromKafka = kafkaSeder.deserialize("test-topic", serializedBySerde); + + // Both should produce identical results + assertArrayEquals(serializedBySerde, serializedByKafka); + assertEquals( + ((InputEvent) fromSerde.getTaskEvent()).getInput(), + ((InputEvent) fromKafka.getTaskEvent()).getInput()); + + // Kafka seder should handle nulls + assertNull(kafkaSeder.serialize("test-topic", null)); + assertNull(kafkaSeder.deserialize("test-topic", null)); + } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java index fbe500b36..038d8dc74 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java @@ -17,10 +17,7 @@ */ package 
org.apache.flink.agents.runtime.actionstate; -import org.apache.flink.agents.api.Event; import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.context.RunnerContext; -import org.apache.flink.agents.plan.JavaFunction; import org.apache.flink.agents.plan.actions.Action; import org.junit.jupiter.api.Test; @@ -207,21 +204,4 @@ public void testParseKeyConsistencyWithDifferentKeys() throws Exception { assertEquals(parsed1.get(2), parsed2.get(2)); // Event UUID assertEquals(parsed1.get(3), parsed2.get(3)); // Action UUID } - - private static class TestAction extends Action { - - public static void doNothing(Event event, RunnerContext context) { - // No operation - } - - public TestAction(String name) throws Exception { - super( - name, - new JavaFunction( - TestAction.class.getName(), - "doNothing", - new Class[] {Event.class, RunnerContext.class}), - List.of(InputEvent.class.getName())); - } - } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java index f830752a2..c885c4734 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java @@ -19,9 +19,7 @@ import org.apache.flink.agents.api.Event; import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.context.RunnerContext; import org.apache.flink.agents.plan.AgentConfiguration; -import org.apache.flink.agents.plan.JavaFunction; import org.apache.flink.agents.plan.actions.Action; import org.apache.fluss.client.admin.Admin; import org.apache.fluss.metadata.TableInfo; @@ -141,9 +139,11 @@ void testPruneSingleKey() throws Exception { store.pruneState(TEST_KEY, 2L); // pruneState is synchronous (in-memory eviction) + // Check surviving entry first: get() with a missing key triggers divergence cleanup + // that removes entries with higher seqNums (same as Kafka backend behavior). + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNull(); - assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); } @Test @@ -160,11 +160,12 @@ void testPruneMultipleAgentKeys() throws Exception { store.pruneState(keyA, 2L); // pruneState is synchronous (in-memory eviction) - assertThat(store.get(keyA, 1L, testAction, testEvent)).isNull(); - assertThat(store.get(keyA, 2L, testAction, testEvent)).isNull(); - // keyB entries unaffected + // Check surviving entries first: get() with a missing key triggers divergence cleanup + // that removes entries with higher seqNums (same as Kafka backend behavior). 
assertThat(store.get(keyB, 1L, testAction, testEvent)).isNotNull(); assertThat(store.get(keyB, 2L, testAction, testEvent)).isNotNull(); + assertThat(store.get(keyA, 1L, testAction, testEvent)).isNull(); + assertThat(store.get(keyA, 2L, testAction, testEvent)).isNull(); } // ==================== Recovery ==================== @@ -183,12 +184,14 @@ void testRecoveryMarkerReturnsBucketOffsets() { } @Test - void testRebuildStateFromBeginning() throws Exception { + void testRebuildStateWithEmptyMarkersSkipsRebuild() throws Exception { store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); - // rebuildState with empty markers should scan from beginning + // rebuildState with empty markers should skip rebuild (aligned with Kafka backend) store.rebuildState(Collections.emptyList()); + // The in-memory cache is not cleared when rebuild is skipped, + // so the state should still be accessible assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); } @@ -197,24 +200,41 @@ void testRebuildStateFromBeginning() throws Exception { void testRebuildStateWithRecoveryMarkers() throws Exception { store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); - // Capture recovery marker after writing data + // Capture recovery marker after writing data (simulates checkpoint boundary) Object marker = store.getRecoveryMarker(); - // Write more data after the marker + // Write more data after the marker (simulates writes between checkpoint and crash) store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); - // Rebuild using the marker; should replay from marker offset and recover data after it - store.rebuildState(List.of(marker)); + // Close to ensure all writes are fully committed before recovery + store.close(); - // Data written before the marker should NOT be in the rebuilt cache - // (rebuildState clears the cache and only replays from the marker offset) - assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); - // Data written after the marker should be recovered - assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + // Simulate recovery: new store instance + FlussActionStateStore recoveredStore = + new FlussActionStateStore(createAgentConfiguration()); + try { + // Rebuild using the marker; should replay from marker offset to current end + recoveredStore.rebuildState(List.of(marker)); + + // Check surviving entry first: get() with a missing key triggers divergence cleanup + // that removes entries with higher seqNums (same as Kafka backend behavior). + // Data written after the marker should be recovered + assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + // Data written before the marker should NOT be in the rebuilt cache + assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); + } finally { + recoveredStore.close(); + // Prevent double-close in tearDown + store = null; + } } @Test void testReconcilePendingPersistence() throws Exception { + // Capture recovery marker BEFORE writing data. In real recovery, the marker represents + // the checkpoint boundary: data written after the marker is recovered via rebuildState. 
+ Object marker = store.getRecoveryMarker(); + ActionState state = new ActionState(testEvent); CallResult pending = CallResult.pending("sideEffectFn", "digest456"); state.addCallResult(pending); @@ -226,8 +246,8 @@ void testReconcilePendingPersistence() throws Exception { FlussActionStateStore recoveredStore = new FlussActionStateStore(createAgentConfiguration()); try { - // Rebuild state from the log - recoveredStore.rebuildState(Collections.emptyList()); + // Rebuild state from the log using recovery markers + recoveredStore.rebuildState(List.of(marker)); ActionState recovered = recoveredStore.get(TEST_KEY, 1L, testAction, testEvent); assertThat(recovered).isNotNull(); @@ -243,6 +263,9 @@ void testReconcilePendingPersistence() throws Exception { @Test void testPruneWorksAfterRecovery() throws Exception { + // Capture recovery marker BEFORE writing data. + Object marker = store.getRecoveryMarker(); + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); store.put(TEST_KEY, 3L, testAction, testEvent, new ActionState(testEvent)); @@ -252,8 +275,8 @@ void testPruneWorksAfterRecovery() throws Exception { FlussActionStateStore recoveredStore = new FlussActionStateStore(createAgentConfiguration()); try { - // Rebuild state from the log - recoveredStore.rebuildState(Collections.emptyList()); + // Rebuild state from the log using recovery markers + recoveredStore.rebuildState(List.of(marker)); assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); @@ -261,10 +284,10 @@ void testPruneWorksAfterRecovery() throws Exception { recoveredStore.pruneState(TEST_KEY, 2L); - // pruneState is synchronous (in-memory eviction) + // Check surviving entry first (get() divergence cleanup side-effect) + assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNull(); - assertThat(recoveredStore.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); } finally { recoveredStore.close(); } @@ -314,21 +337,4 @@ private void waitForTableReady() throws Exception { FLUSS_CLUSTER.waitUntilTableReady(tableInfo.getTableId()); } } - - private static class TestAction extends Action { - - public static void doNothing(Event event, RunnerContext context) { - // No operation - } - - public TestAction(String name) throws Exception { - super( - name, - new JavaFunction( - TestAction.class.getName(), - "doNothing", - new Class[] {Event.class, RunnerContext.class}), - List.of(InputEvent.class.getName())); - } - } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java index d2f948f40..8d4752b63 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java @@ -17,120 +17,157 @@ */ package org.apache.flink.agents.runtime.actionstate; -import com.fasterxml.jackson.annotation.JsonTypeInfo; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.flink.agents.api.Event; import org.apache.flink.agents.api.InputEvent; -import 
org.apache.flink.agents.api.OutputEvent; +import org.apache.flink.agents.plan.actions.Action; +import org.apache.fluss.client.Connection; +import org.apache.fluss.client.table.Table; +import org.apache.fluss.client.table.writer.AppendWriter; +import org.apache.fluss.row.InternalRow; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; -/** - * Unit tests for {@link FlussActionStateStore} serialization and in-memory behavior. These tests - * verify the ObjectMapper configuration and ActionState serialization round-trips without requiring - * a real Fluss cluster. - */ +/** Unit tests for {@link FlussActionStateStore} store-level behavior. */ public class FlussActionStateStoreTest { - private ObjectMapper objectMapper; + private static final String TEST_KEY = "test-key"; - @JsonTypeInfo( - use = JsonTypeInfo.Id.CLASS, - include = JsonTypeInfo.As.PROPERTY, - property = "@class") - abstract static class EventTypeInfoMixin {} + private AppendWriter mockWriter; + private FlussActionStateStore store; + private Action testAction; + private Event testEvent; + private ActionState testActionState; + private Map actionStates; @BeforeEach - void setUp() { - objectMapper = new ObjectMapper(); - objectMapper.addMixIn(Event.class, EventTypeInfoMixin.class); - objectMapper.addMixIn(InputEvent.class, EventTypeInfoMixin.class); - objectMapper.addMixIn(OutputEvent.class, EventTypeInfoMixin.class); + void setUp() throws Exception { + mockWriter = mock(AppendWriter.class); + when(mockWriter.append(any(InternalRow.class))) + .thenReturn(CompletableFuture.completedFuture(null)); + + actionStates = new HashMap<>(); + store = + new FlussActionStateStore( + actionStates, mock(Connection.class), mock(Table.class), mockWriter); + + testAction = new TestAction("test-action"); + testEvent = new InputEvent("test data"); + testActionState = new ActionState(testEvent); } @Test - void testCompletionOnlyFlow() throws Exception { - InputEvent taskEvent = new InputEvent("test-data"); - ActionState state = new ActionState(taskEvent); - CallResult successResult = - new CallResult( - "myFunction", - "argsDigest123", - "result-payload".getBytes(StandardCharsets.UTF_8)); - state.addCallResult(successResult); - - byte[] payload = objectMapper.writeValueAsBytes(state); - ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + void testPutActionState() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, testActionState); - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getCallResults()).hasSize(1); - CallResult retrievedResult = retrieved.getCallResult(0); - assertThat(retrievedResult.getFunctionId()).isEqualTo("myFunction"); - assertThat(retrievedResult.getArgsDigest()).isEqualTo("argsDigest123"); - assertThat(retrievedResult.isSuccess()).isTrue(); - assertThat(retrievedResult.getResultPayload()) - .isEqualTo("result-payload".getBytes(StandardCharsets.UTF_8)); + verify(mockWriter).append(any(InternalRow.class)); + + String stateKey = ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent); + assertThat(actionStates).containsKey(stateKey); + 
assertThat(actionStates.get(stateKey)).isEqualTo(testActionState); } @Test - void testMultipleCallResults() throws Exception { - InputEvent taskEvent = new InputEvent("test-data"); - ActionState state = new ActionState(taskEvent); - state.addCallResult( - new CallResult("fn1", "d1", "ok".getBytes(StandardCharsets.UTF_8))); // SUCCEEDED - state.addCallResult(CallResult.pending("fn2", "d2")); // PENDING - state.addCallResult( - new CallResult( - "fn3", "d3", null, "error".getBytes(StandardCharsets.UTF_8))); // FAILED - - byte[] payload = objectMapper.writeValueAsBytes(state); - ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + void testGetNonExistentActionState() throws Exception { + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 4L, testAction, testEvent), testActionState); + + // Request a key with a different action triggers divergence cleanup + store.get(TEST_KEY, 2L, new TestAction("test-1"), testEvent); + + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNull(); + assertThat(store.get(TEST_KEY, 4L, testAction, testEvent)).isNull(); + } - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getCallResults()).hasSize(3); - assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); - assertThat(retrieved.getCallResult(1).isPending()).isTrue(); - assertThat(retrieved.getCallResult(2).isFailure()).isTrue(); + @Test + void testGetActionStateWithDiverge() throws Exception { + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState); + // diverge: same key+seqNum, different action + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 2L, new TestAction("test-2"), testEvent), + testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 4L, testAction, testEvent), testActionState); + + store.get(TEST_KEY, 2L, testAction, testEvent); + + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNull(); + assertThat(store.get(TEST_KEY, 4L, testAction, testEvent)).isNull(); } @Test - void testCompletedAction() throws Exception { - InputEvent taskEvent = new InputEvent("test-data"); - ActionState state = new ActionState(taskEvent); - state.addCallResult(new CallResult("fn1", "d1", "result".getBytes(StandardCharsets.UTF_8))); - state.markCompleted(); + void testPruneState() throws Exception { + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState); + actionStates.put( + ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState); + + assertThat(store.get(TEST_KEY, 1L, testAction, 
testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); + + // Prune states up to sequence number 2 + store.pruneState(TEST_KEY, 2L); + + assertThat( + actionStates.get( + ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent))) + .isNull(); + assertThat( + actionStates.get( + ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent))) + .isNull(); + assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull(); + } + + @Test + void testActionStateUpdates() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, testActionState); - byte[] payload = objectMapper.writeValueAsBytes(state); - ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + // Update the same key with a new state + InputEvent updatedEvent = new InputEvent("updated data"); + ActionState updatedState = new ActionState(updatedEvent); + store.put(TEST_KEY, 1L, testAction, testEvent, updatedState); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); assertThat(retrieved).isNotNull(); - assertThat(retrieved.isCompleted()).isTrue(); - assertThat(retrieved.getCallResults()).isEmpty(); + assertThat(retrieved.getTaskEvent()).isEqualTo(updatedEvent); } @Test - void testFullSerializationRoundTrip() throws Exception { - InputEvent taskEvent = new InputEvent("full-test-data"); - ActionState state = new ActionState(taskEvent); - state.addEvent(new InputEvent("output-event-1")); - state.addEvent(new InputEvent("output-event-2")); - state.addCallResult( - new CallResult("fn1", "digest1", "payload1".getBytes(StandardCharsets.UTF_8))); - state.addCallResult(CallResult.pending("fn2", "digest2")); - - byte[] payload = objectMapper.writeValueAsBytes(state); - ActionState retrieved = objectMapper.readValue(payload, ActionState.class); + void testRebuildStateWithEmptyMarkers() throws Exception { + store.put(TEST_KEY, 1L, testAction, testEvent, testActionState); - assertThat(retrieved).isNotNull(); - assertThat(retrieved.getTaskEvent()).isEqualTo(taskEvent); - assertThat(retrieved.getOutputEvents()).hasSize(2); - assertThat(retrieved.getCallResults()).hasSize(2); - assertThat(retrieved.getCallResult(0).isSuccess()).isTrue(); - assertThat(retrieved.getCallResult(1).isPending()).isTrue(); - assertThat(retrieved.isCompleted()).isFalse(); + // rebuildState with empty markers should skip rebuild and preserve existing cache + store.rebuildState(Collections.emptyList()); + + assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java index cd32524be..33b4193dd 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java @@ -19,9 +19,7 @@ import org.apache.flink.agents.api.Event; import org.apache.flink.agents.api.InputEvent; -import org.apache.flink.agents.api.context.RunnerContext; import org.apache.flink.agents.plan.AgentConfiguration; -import org.apache.flink.agents.plan.JavaFunction; import org.apache.flink.agents.plan.actions.Action; import org.apache.kafka.clients.consumer.ConsumerRecord; import org.apache.kafka.clients.consumer.MockConsumer; @@ -256,21 +254,4 @@ void 
testRebuildState() throws Exception { ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent))) .isEqualTo(thirdState); } - - private static class TestAction extends Action { - - public static void doNothing(Event event, RunnerContext context) { - // No operation - } - - public TestAction(String name) throws Exception { - super( - name, - new JavaFunction( - TestAction.class.getName(), - "doNothing", - new Class[] {Event.class, RunnerContext.class}), - List.of(InputEvent.class.getName())); - } - } } diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java new file mode 100644 index 000000000..de097c5a2 --- /dev/null +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.agents.runtime.actionstate; + +import org.apache.flink.agents.api.Event; +import org.apache.flink.agents.api.InputEvent; +import org.apache.flink.agents.api.context.RunnerContext; +import org.apache.flink.agents.plan.JavaFunction; +import org.apache.flink.agents.plan.actions.Action; + +import java.util.List; + +/** A no-op {@link Action} for use in action state store tests. */ +public class TestAction extends Action { + + public static void doNothing(Event event, RunnerContext context) { + // No operation + } + + public TestAction(String name) throws Exception { + super( + name, + new JavaFunction( + TestAction.class.getName(), + "doNothing", + new Class[] {Event.class, RunnerContext.class}), + List.of(InputEvent.class.getName())); + } +} From db93e1971a735247871af302a2b17a6d2dc2a5a4 Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Wed, 29 Apr 2026 14:54:19 +0800 Subject: [PATCH 5/9] refactor(runtime): Remove redundant comments from the FlussActionStateStore class --- .../actionstate/FlussActionStateStore.java | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java index d31900f06..2af543232 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -71,22 +71,6 @@ * An implementation of {@link ActionStateStore} that uses an Apache Fluss log table as the backend. * All state is maintained in an in-memory map for fast lookups, with the Fluss log table providing * durability and recovery support. - * - *
- * <ul>
- *   <li>Write: Append to Fluss log + update in-memory cache (synchronous).
- *   <li>Read: In-memory lookup with divergence detection.
- *   <li>Recovery: Scan log from checkpoint offsets (recovery markers) to the latest offset.
- *   <li>Prune: In-memory eviction only; physical cleanup relies on Fluss log retention.
- * </ul>
- *
- * <p>The Fluss log table schema:
- *
- * <pre>
- * state_key     STRING  -- Generated by ActionStateUtil.generateKey()
- * state_payload BYTES   -- ActionState JSON serialized bytes
- * agent_key     STRING  -- Original input key; used as bucket key so all records for the same
- *                          input key are co-located in the same bucket
- * </pre>
*/ public class FlussActionStateStore implements ActionStateStore { From 8399e201cfe5cb97361978ff1770dc37620aeadc Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Wed, 29 Apr 2026 15:52:29 +0800 Subject: [PATCH 6/9] test(FlussActionStateStore): Remove unnecessary integration tests and optimize helper methods --- .../actionstate/FlussActionStateStoreIT.java | 103 +++--------------- .../FlussActionStateStoreTest.java | 11 -- 2 files changed, 17 insertions(+), 97 deletions(-) diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java index c885c4734..947fa1414 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java @@ -39,7 +39,6 @@ import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE_BUCKETS; import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_BOOTSTRAP_SERVERS; import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatNoException; /** Integration tests for {@link FlussActionStateStore} against an embedded Fluss cluster. */ public class FlussActionStateStoreIT { @@ -146,28 +145,6 @@ void testPruneSingleKey() throws Exception { assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNull(); } - @Test - void testPruneMultipleAgentKeys() throws Exception { - String keyA = "agent-key-a"; - String keyB = "agent-key-b"; - - store.put(keyA, 1L, testAction, testEvent, new ActionState(testEvent)); - store.put(keyA, 2L, testAction, testEvent, new ActionState(testEvent)); - store.put(keyB, 1L, testAction, testEvent, new ActionState(testEvent)); - store.put(keyB, 2L, testAction, testEvent, new ActionState(testEvent)); - - // Prune only keyA up to seqNum 2 - store.pruneState(keyA, 2L); - - // pruneState is synchronous (in-memory eviction) - // Check surviving entries first: get() with a missing key triggers divergence cleanup - // that removes entries with higher seqNums (same as Kafka backend behavior). - assertThat(store.get(keyB, 1L, testAction, testEvent)).isNotNull(); - assertThat(store.get(keyB, 2L, testAction, testEvent)).isNotNull(); - assertThat(store.get(keyA, 1L, testAction, testEvent)).isNull(); - assertThat(store.get(keyA, 2L, testAction, testEvent)).isNull(); - } - // ==================== Recovery ==================== @Test @@ -229,38 +206,6 @@ void testRebuildStateWithRecoveryMarkers() throws Exception { } } - @Test - void testReconcilePendingPersistence() throws Exception { - // Capture recovery marker BEFORE writing data. In real recovery, the marker represents - // the checkpoint boundary: data written after the marker is recovered via rebuildState. 
- Object marker = store.getRecoveryMarker(); - - ActionState state = new ActionState(testEvent); - CallResult pending = CallResult.pending("sideEffectFn", "digest456"); - state.addCallResult(pending); - - store.put(TEST_KEY, 1L, testAction, testEvent, state); - store.close(); - - // Create a new store instance (simulating recovery) - FlussActionStateStore recoveredStore = - new FlussActionStateStore(createAgentConfiguration()); - try { - // Rebuild state from the log using recovery markers - recoveredStore.rebuildState(List.of(marker)); - ActionState recovered = recoveredStore.get(TEST_KEY, 1L, testAction, testEvent); - - assertThat(recovered).isNotNull(); - assertThat(recovered.getCallResults()).hasSize(1); - CallResult recoveredResult = recovered.getCallResult(0); - assertThat(recoveredResult.isPending()).isTrue(); - assertThat(recoveredResult.getFunctionId()).isEqualTo("sideEffectFn"); - assertThat(recoveredResult.getArgsDigest()).isEqualTo("digest456"); - } finally { - recoveredStore.close(); - } - } - @Test void testPruneWorksAfterRecovery() throws Exception { // Capture recovery marker BEFORE writing data. @@ -293,46 +238,32 @@ void testPruneWorksAfterRecovery() throws Exception { } } - // ==================== Infrastructure ==================== - - @Test - void testIdempotentTableCreation() throws Exception { - // Creating a second store with the same config should succeed - // (table already exists, createTable with ignoreIfExists=true) - FlussActionStateStore secondStore = new FlussActionStateStore(createAgentConfiguration()); - try { - secondStore.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); - assertThat(secondStore.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); - } finally { - secondStore.close(); - } - } - - @Test - void testCloseIdempotent() throws Exception { - store.close(); - assertThatNoException().isThrownBy(() -> store.close()); - // Set to null to prevent double-close in tearDown - store = null; - } - // ==================== Helpers ==================== private AgentConfiguration createAgentConfiguration() { + return createAgentConfiguration(TEST_DATABASE, TEST_TABLE, 1); + } + + private AgentConfiguration createAgentConfiguration( + String database, String table, int buckets) { AgentConfiguration config = new AgentConfiguration(); config.set(FLUSS_BOOTSTRAP_SERVERS, FLUSS_CLUSTER.getBootstrapServers()); - config.set(FLUSS_ACTION_STATE_DATABASE, TEST_DATABASE); - config.set(FLUSS_ACTION_STATE_TABLE, TEST_TABLE); - config.set(FLUSS_ACTION_STATE_TABLE_BUCKETS, 1); + config.set(FLUSS_ACTION_STATE_DATABASE, database); + config.set(FLUSS_ACTION_STATE_TABLE, table); + config.set(FLUSS_ACTION_STATE_TABLE_BUCKETS, buckets); return config; } private void waitForTableReady() throws Exception { - TablePath tablePath = TablePath.of(TEST_DATABASE, TEST_TABLE); - try (Admin admin = - org.apache.fluss.client.ConnectionFactory.createConnection( - FLUSS_CLUSTER.getClientConfig()) - .getAdmin()) { + waitForTableReady(TEST_DATABASE, TEST_TABLE); + } + + private void waitForTableReady(String database, String table) throws Exception { + TablePath tablePath = TablePath.of(database, table); + try (org.apache.fluss.client.Connection conn = + org.apache.fluss.client.ConnectionFactory.createConnection( + FLUSS_CLUSTER.getClientConfig()); + Admin admin = conn.getAdmin()) { TableInfo tableInfo = admin.getTableInfo(tablePath).get(); FLUSS_CLUSTER.waitUntilTableReady(tableInfo.getTableId()); } diff --git 
a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java index 8d4752b63..54b34dc1c 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java @@ -27,7 +27,6 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.concurrent.CompletableFuture; @@ -160,14 +159,4 @@ void testActionStateUpdates() throws Exception { assertThat(retrieved).isNotNull(); assertThat(retrieved.getTaskEvent()).isEqualTo(updatedEvent); } - - @Test - void testRebuildStateWithEmptyMarkers() throws Exception { - store.put(TEST_KEY, 1L, testAction, testEvent, testActionState); - - // rebuildState with empty markers should skip rebuild and preserve existing cache - store.rebuildState(Collections.emptyList()); - - assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull(); - } } From ff9e1471a52efb6c56d86ac0568317049e902fbf Mon Sep 17 00:00:00 2001 From: Junbo Wang Date: Wed, 29 Apr 2026 19:51:29 +0800 Subject: [PATCH 7/9] Refactor FlussActionStateStore recovery with dynamic SASL support, add multi-bucket and SASL integration tests, enhance unit test coverage, and update security protocol docs. --- .../api/configuration/AgentConfigOptions.java | 6 +- docs/content/docs/operations/configuration.md | 2 +- .../actionstate/FlussActionStateStore.java | 276 +++++++++--------- .../actionstate/ActionStateUtilTest.java | 16 +- .../actionstate/FlussActionStateStoreIT.java | 61 +++- .../FlussActionStateStoreSaslIT.java | 168 +++++++++++ .../FlussActionStateStoreTest.java | 111 +++---- .../KafkaActionStateStoreTest.java | 6 +- .../{TestAction.java => NoOpAction.java} | 6 +- 9 files changed, 446 insertions(+), 206 deletions(-) create mode 100644 runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreSaslIT.java rename runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/{TestAction.java => NoOpAction.java} (91%) diff --git a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java index 441b414e1..e936ce5a5 100644 --- a/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java +++ b/api/src/main/java/org/apache/flink/agents/api/configuration/AgentConfigOptions.java @@ -67,7 +67,11 @@ public class AgentConfigOptions { public static final ConfigOption FLUSS_ACTION_STATE_TABLE_BUCKETS = new ConfigOption<>("flussActionStateTableBuckets", Integer.class, 64); - /** The config parameter specifies the authentication protocol for Fluss client. */ + /** + * The config parameter specifies the authentication protocol for Fluss client. Valid values: + * {@code "PLAINTEXT"} (default, no authentication) and {@code "SASL"} (SASL/PLAIN + * authentication). Value is case-insensitive. 
+ */ public static final ConfigOption FLUSS_SECURITY_PROTOCOL = new ConfigOption<>("flussSecurityProtocol", String.class, "PLAINTEXT"); diff --git a/docs/content/docs/operations/configuration.md b/docs/content/docs/operations/configuration.md index 757eb2649..1b9b477ab 100644 --- a/docs/content/docs/operations/configuration.md +++ b/docs/content/docs/operations/configuration.md @@ -167,7 +167,7 @@ Here are the configuration options for Fluss-based Action State Store. | `flussActionStateDatabase` | "flink_agents" | String | The Fluss database name for storing action state. | | `flussActionStateTable` | "action_states" | String | The Fluss table name for storing action state. | | `flussActionStateTableBuckets` | 64 | Integer | The number of buckets for the Fluss action state table. | -| `flussSecurityProtocol` | "PLAINTEXT" | String | The authentication protocol for Fluss client (e.g., `PLAINTEXT`, `SASL_PLAIN`). | +| `flussSecurityProtocol` | "PLAINTEXT" | String | The authentication protocol for Fluss client. Valid values: `PLAINTEXT` (default, no authentication), `SASL` (SASL/PLAIN authentication). | | `flussSaslMechanism` | "PLAIN" | String | The SASL mechanism for Fluss authentication. | | `flussSaslJaasConfig` | (none) | String | The JAAS configuration string for Fluss SASL authentication. | | `flussSaslUsername` | (none) | String | The username for Fluss SASL authentication. | diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java index 2af543232..ebbf8a40c 100644 --- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java +++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java @@ -129,22 +129,27 @@ public FlussActionStateStore(AgentConfiguration agentConfiguration) { // that each append is sent immediately without waiting for additional records to batch. flussConf.set(ConfigOptions.CLIENT_WRITER_BATCH_TIMEOUT, Duration.ZERO); - flussConf.setString( - CLIENT_SECURITY_PROTOCOL, agentConfiguration.get(FLUSS_SECURITY_PROTOCOL)); - - flussConf.setString(CLIENT_SASL_MECHANISM, agentConfiguration.get(FLUSS_SASL_MECHANISM)); - - String jaasConfig = agentConfiguration.get(FLUSS_SASL_JAAS_CONFIG); - if (jaasConfig != null) { - flussConf.setString(CLIENT_SASL_JAAS_CONFIG, jaasConfig); - } - String username = agentConfiguration.get(FLUSS_SASL_USERNAME); - if (username != null) { - flussConf.setString(CLIENT_SASL_JAAS_USERNAME, username); - } - String password = agentConfiguration.get(FLUSS_SASL_PASSWORD); - if (password != null) { - flussConf.setString(CLIENT_SASL_JAAS_PASSWORD, password); + // Only set security/SASL parameters when the protocol requires authentication. + // When PLAINTEXT (the default), SASL parameters are semantically invalid and may + // cause the Fluss client to attempt an unwanted SASL handshake. 
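For illustration, enabling SASL end-to-end through the agent configuration would look roughly like this; the endpoint and credentials are placeholders:

    AgentConfiguration conf = new AgentConfiguration();
    conf.set(FLUSS_BOOTSTRAP_SERVERS, "fluss.example.com:9123");
    conf.set(FLUSS_SECURITY_PROTOCOL, "SASL"); // any value other than PLAINTEXT enables the SASL branch
    conf.set(FLUSS_SASL_MECHANISM, "PLAIN");
    conf.set(FLUSS_SASL_USERNAME, "agent");
    conf.set(FLUSS_SASL_PASSWORD, "agent-secret");
    ActionStateStore store = new FlussActionStateStore(conf);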
+ String securityProtocol = agentConfiguration.get(FLUSS_SECURITY_PROTOCOL); + flussConf.setString(CLIENT_SECURITY_PROTOCOL, securityProtocol); + if (!"PLAINTEXT".equalsIgnoreCase(securityProtocol)) { + flussConf.setString( + CLIENT_SASL_MECHANISM, agentConfiguration.get(FLUSS_SASL_MECHANISM)); + + String jaasConfig = agentConfiguration.get(FLUSS_SASL_JAAS_CONFIG); + if (jaasConfig != null) { + flussConf.setString(CLIENT_SASL_JAAS_CONFIG, jaasConfig); + } + String username = agentConfiguration.get(FLUSS_SASL_USERNAME); + if (username != null) { + flussConf.setString(CLIENT_SASL_JAAS_USERNAME, username); + } + String password = agentConfiguration.get(FLUSS_SASL_PASSWORD); + if (password != null) { + flussConf.setString(CLIENT_SASL_JAAS_PASSWORD, password); + } } this.connection = ConnectionFactory.createConnection(flussConf); @@ -236,31 +241,13 @@ public void rebuildState(List recoveryMarkers) { actionStates.clear(); - // Compute per-bucket start offsets from recovery markers - Map bucketStartOffsets = new HashMap<>(); - for (Object marker : recoveryMarkers) { - if (marker instanceof Map) { - @SuppressWarnings("unchecked") - Map markerMap = (Map) marker; - for (Map.Entry entry : markerMap.entrySet()) { - bucketStartOffsets.merge(entry.getKey(), entry.getValue(), Math::min); - } - } else if (marker != null) { - LOG.warn( - "Ignoring unrecognized recovery marker type: {}", - marker.getClass().getName()); - } - } - + Map bucketStartOffsets = mergeRecoveryMarkerOffsets(recoveryMarkers); if (bucketStartOffsets.isEmpty()) { LOG.info("No valid bucket offsets in recovery markers, skipping state rebuild"); return; } - // Capture the latest offsets as the stopping point for each bucket Map bucketEndOffsets = getBucketEndOffsets(); - // Capture the earliest available offsets so we can skip buckets whose data has - // been fully cleaned by log retention (earliestOffset >= endOffset). 
         Map<Integer, Long> bucketEarliestOffsets = getBucketEarliestOffsets();
         LOG.debug(
                 "Rebuild window: startOffsets={}, earliestOffsets={}, endOffsets={}",
@@ -269,120 +256,135 @@ public void rebuildState(List<Object> recoveryMarkers) {
                 bucketEndOffsets);
 
         try (LogScanner scanner = table.newScan().createLogScanner()) {
-            // Track which buckets still need to be consumed
-            Map<Integer, Long> remainingBuckets = new HashMap<>();
-            for (Map.Entry<Integer, Long> entry : bucketStartOffsets.entrySet()) {
-                int bucket = entry.getKey();
-                long startOffset = entry.getValue();
-                Long endOffset = bucketEndOffsets.get(bucket);
-
-                // Bucket referenced in recovery marker does not exist in current table
-                if (endOffset == null) {
-                    LOG.warn(
-                            "Bucket {} referenced in recovery marker does not exist "
-                                    + "in current table, state recovery for this bucket is skipped",
-                            bucket);
-                    continue;
-                }
+            Map<Integer, Long> remainingBuckets =
+                    subscribeEffectiveOffsets(
+                            scanner, bucketStartOffsets, bucketEndOffsets, bucketEarliestOffsets);
+            LOG.debug("Subscribed buckets for rebuild: {}", remainingBuckets);
 
-                // No new data since checkpoint (includes empty buckets that never had writes)
-                if (endOffset <= startOffset) {
-                    LOG.info(
-                            "Skipping bucket {} for rebuild: no new data "
-                                    + "(endOffset={} <= startOffset={})",
-                            bucket,
-                            endOffset,
-                            startOffset);
-                    continue;
-                }
+            pollAndReplay(scanner, remainingBuckets);
+        } catch (Exception e) {
+            throw new RuntimeException("Failed to rebuild state from Fluss log table", e);
+        }
+
+        LOG.info("Completed rebuilding state, recovered {} states", actionStates.size());
+    }
 
-                // Check if retention has cleaned data in the recovery window
-                Long earliestOffset = bucketEarliestOffsets.get(bucket);
-                long effectiveStart = startOffset;
-                if (earliestOffset != null && earliestOffset > startOffset) {
-                    effectiveStart = earliestOffset;
-                    if (effectiveStart >= endOffset) {
-                        // All data in recovery window has been cleaned by retention
-                        LOG.warn(
-                                "Bucket {} state recovery failed: all data between offset {} "
-                                        + "and {} has been cleaned by log retention "
-                                        + "(earliest available: {})",
-                                bucket,
-                                startOffset,
-                                endOffset,
-                                earliestOffset);
-                        continue;
-                    }
-                    // Partial data loss: some state between startOffset and earliestOffset is gone
-                    LOG.warn(
-                            "Bucket {} partial state loss: data between offset {} and {} "
-                                    + "has been cleaned by log retention, "
-                                    + "recovering from offset {} instead",
-                            bucket,
-                            startOffset,
-                            earliestOffset,
-                            effectiveStart);
+    /**
+     * Merges recovery markers into a per-bucket start offset map. For each bucket, the minimum
+     * offset across all markers is used to cover the widest recovery window.
+     */
+    private Map<Integer, Long> mergeRecoveryMarkerOffsets(List<Object> recoveryMarkers) {
+        Map<Integer, Long> bucketStartOffsets = new HashMap<>();
+        for (Object marker : recoveryMarkers) {
+            if (marker instanceof Map) {
+                @SuppressWarnings("unchecked")
+                Map<Integer, Long> markerMap = (Map<Integer, Long>) marker;
+                for (Map.Entry<Integer, Long> entry : markerMap.entrySet()) {
+                    bucketStartOffsets.merge(entry.getKey(), entry.getValue(), Math::min);
                 }
+            } else if (marker != null) {
+                LOG.warn(
+                        "Ignoring unrecognized recovery marker type: {}",
+                        marker.getClass().getName());
+            }
+        }
+        return bucketStartOffsets;
+    }
 
-                scanner.subscribe(bucket, effectiveStart);
-                remainingBuckets.put(bucket, endOffset);
+    /**
+     * Validates effective offsets for each bucket and subscribes the scanner. Buckets with no new
+     * data are skipped; buckets with data loss (retention cleaned the recovery window) cause an
+     * immediate failure.
+     *
+     * @return a map of bucket-id to end-offset for buckets that need to be scanned
+     */
+    private Map<Integer, Long> subscribeEffectiveOffsets(
+            LogScanner scanner,
+            Map<Integer, Long> bucketStartOffsets,
+            Map<Integer, Long> bucketEndOffsets,
+            Map<Integer, Long> bucketEarliestOffsets) {
+        Map<Integer, Long> remainingBuckets = new HashMap<>();
+        for (Map.Entry<Integer, Long> entry : bucketStartOffsets.entrySet()) {
+            int bucket = entry.getKey();
+            long startOffset = entry.getValue();
+            long endOffset = bucketEndOffsets.get(bucket);
+            long earliestOffset = bucketEarliestOffsets.get(bucket);
+
+            // No new data since checkpoint (includes empty buckets that never had writes:
+            // endOffset=0, startOffset=0)
+            if (endOffset <= startOffset) {
+                LOG.info(
+                        "Skipping bucket {} for rebuild: no new data "
+                                + "(endOffset={} <= startOffset={})",
+                        bucket,
+                        endOffset,
+                        startOffset);
+                continue;
+            }
+
+            // Retention has cleaned data in the recovery window — data loss
+            if (earliestOffset > startOffset) {
+                throw new RuntimeException(
+                        String.format(
+                                "Data loss detected for bucket %s: "
+                                        + "retention has cleaned data in the recovery window "
+                                        + "(earliestOffset=%d > startOffset=%d). "
+                                        + "Cannot recover state safely.",
+                                bucket, earliestOffset, startOffset));
             }
-            LOG.debug("Subscribed buckets for rebuild: {}", remainingBuckets);
 
-            while (!remainingBuckets.isEmpty()) {
-
-                ScanRecords records = scanner.poll(POLL_TIMEOUT);
-                for (TableBucket bucket : records.buckets()) {
-                    Long endOffset = remainingBuckets.get(bucket.getBucket());
-                    if (endOffset == null) {
-                        continue;
-                    }
-                    // Track the highest offset seen in this batch (including skipped records
-                    // beyond endOffset) so that we can detect when the bucket has been fully
-                    // consumed even if the last records are past our target window.
-                    long lastSeenOffset = -1;
-                    for (ScanRecord record : records.records(bucket)) {
-                        lastSeenOffset = record.logOffset();
-                        if (record.logOffset() >= endOffset) {
-                            continue;
-                        }
-                        InternalRow row = record.getRow();
-                        String stateKey = row.getString(COL_STATE_KEY).toString();
-                        byte[] payload = row.getBytes(COL_STATE_PAYLOAD);
-                        try {
-                            ActionState state = ActionStateSerde.deserialize(payload);
-                            actionStates.put(stateKey, state);
-                        } catch (Exception e) {
-                            LOG.warn(
-                                    "Failed to deserialize action state for key: {}, skipping",
-                                    stateKey,
-                                    e);
-                        }
-                    }
-                    // Remove bucket if the highest seen offset has reached or passed the end
-                    if (lastSeenOffset + 1 >= endOffset) {
-                        remainingBuckets.remove(bucket.getBucket());
-                    }
+            scanner.subscribe(bucket, startOffset);
+            remainingBuckets.put(bucket, endOffset);
+        }
+        return remainingBuckets;
+    }
+
+    /**
+     * Polls the scanner and replays deserialized action states until all subscribed buckets have
+     * been fully consumed up to their end offsets.
+     */
+    private void pollAndReplay(LogScanner scanner, Map<Integer, Long> remainingBuckets) {
+        while (!remainingBuckets.isEmpty()) {
+            ScanRecords records = scanner.poll(POLL_TIMEOUT);
+            for (TableBucket bucket : records.buckets()) {
+                Long endOffset = remainingBuckets.get(bucket.getBucket());
+                long lastSeenOffset = replayRecords(records.records(bucket), endOffset);
+                if (lastSeenOffset + 1 >= endOffset) {
+                    remainingBuckets.remove(bucket.getBucket());
                 }
             }
-        } catch (Exception e) {
-            throw new RuntimeException("Failed to rebuild state from Fluss log table", e);
         }
+    }
 
-        LOG.info("Completed rebuilding state, recovered {} states", actionStates.size());
+    /**
+     * Replays records from a single bucket, deserializing and applying each action state. Returns
+     * the highest log offset seen (including records past endOffset), used to detect bucket
+     * completion.
+     */
+    private long replayRecords(Iterable<ScanRecord> records, long endOffset) {
+        long lastSeenOffset = -1;
+        for (ScanRecord record : records) {
+            lastSeenOffset = record.logOffset();
+            if (record.logOffset() >= endOffset) {
+                break;
+            }
+            InternalRow row = record.getRow();
+            String stateKey = row.getString(COL_STATE_KEY).toString();
+            byte[] payload = row.getBytes(COL_STATE_PAYLOAD);
+            ActionState state = ActionStateSerde.deserialize(payload);
+            actionStates.put(stateKey, state);
+        }
+        return lastSeenOffset;
     }
 
-    @SuppressWarnings("unchecked")
     private Map<Integer, Long> getBucketEndOffsets() {
         return getBucketOffsets(new OffsetSpec.LatestSpec());
     }
 
-    @SuppressWarnings("unchecked")
     private Map<Integer, Long> getBucketEarliestOffsets() {
         return getBucketOffsets(new OffsetSpec.EarliestSpec());
     }
 
-    @SuppressWarnings("unchecked")
     private Map<Integer, Long> getBucketOffsets(OffsetSpec offsetSpec) {
         int numBuckets = table.getTableInfo().getNumBuckets();
         try (Admin admin = connection.getAdmin()) {
@@ -390,8 +392,7 @@ private Map<Integer, Long> getBucketOffsets(OffsetSpec offsetSpec) {
             for (int b = 0; b < numBuckets; b++) {
                 buckets.add(b);
             }
-            return (Map<Integer, Long>)
-                    admin.listOffsets(tablePath, buckets, offsetSpec).all().get();
+            return admin.listOffsets(tablePath, buckets, offsetSpec).all().get();
         } catch (Exception e) {
             throw new RuntimeException("Failed to get offsets for Fluss table: " + tablePath, e);
         }
@@ -437,14 +438,11 @@ public void pruneState(Object key, long seqNum) {
 
     @Override
     public void close() throws Exception {
-        try {
-            if (table != null) {
-                table.close();
-            }
-        } finally {
-            if (connection != null) {
-                connection.close();
-            }
+        if (table != null) {
+            table.close();
+        }
+        if (connection != null) {
+            connection.close();
         }
     }
 
diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java
index 038d8dc74..2a90c1f15 100644
--- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java
+++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/ActionStateUtilTest.java
@@ -35,7 +35,7 @@ public class ActionStateUtilTest {
     public void testGenerateKeyConsistency() throws Exception {
         // Create test data
         Object key = "consistency-test";
-        Action action = new TestAction("consistency-action");
+        Action action = new NoOpAction("consistency-action");
         InputEvent inputEvent = new InputEvent("same-input");
         InputEvent inputEvent2 = new InputEvent("same-input");
 
@@ -51,7 +51,7 @@ public void testGenerateKeyConsistency() throws Exception {
     public void testGenerateKeyDifferentInputs() throws Exception {
         // Create test data
         Object key = "diff-test";
-        Action action = new TestAction("diff-action");
+        Action action = new NoOpAction("diff-action");
         InputEvent inputEvent1 = new InputEvent("input1");
         InputEvent inputEvent2 = new InputEvent("input2");
 
@@ -65,7 +65,7 @@
     @Test
     public void testGenerateKeyWithNullKey() throws Exception {
-        Action action = new TestAction("test-action");
+        Action action = new NoOpAction("test-action");
         InputEvent inputEvent = new InputEvent("test-input");
 
         assertThrows(
@@ -90,7 +90,7 @@ public void testGenerateKeyWithNullAction() {
     @Test
     public void testGenerateKeyWithNullEvent() throws Exception {
         Object key = "test-key";
-        Action action = new TestAction("test-action");
+        Action action = new NoOpAction("test-action");
NoOpAction("test-action"); assertThrows( NullPointerException.class, @@ -103,7 +103,7 @@ public void testGenerateKeyWithNullEvent() throws Exception { public void testParseKeyValidKey() throws Exception { // Create test data and generate a key Object key = "test-key"; - Action action = new TestAction("test-action"); + Action action = new NoOpAction("test-action"); InputEvent inputEvent = new InputEvent("test-input"); long seqNum = 123; @@ -125,7 +125,7 @@ public void testParseKeyValidKey() throws Exception { public void testParseKeyRoundTrip() throws Exception { // Test that generate -> parse -> values match original inputs Object originalKey = "round-trip-test"; - Action action = new TestAction("round-trip-action"); + Action action = new NoOpAction("round-trip-action"); InputEvent inputEvent = new InputEvent("round-trip-input"); long seqNum = 456; @@ -173,7 +173,7 @@ public void testParseKeyWithInvalidFormat() { public void testParseKeyWithSpecialCharacters() throws Exception { // Test with keys containing special characters (but not the separator) Object key = "key-with-special@chars#123"; - Action action = new TestAction("action-with-special@chars"); + Action action = new NoOpAction("action-with-special@chars"); InputEvent inputEvent = new InputEvent("input-with-special@chars"); long seqNum = 789; @@ -187,7 +187,7 @@ public void testParseKeyWithSpecialCharacters() throws Exception { @Test public void testParseKeyConsistencyWithDifferentKeys() throws Exception { // Generate keys with different inputs and verify parsing consistency - Action action = new TestAction("consistency-action"); + Action action = new NoOpAction("consistency-action"); InputEvent inputEvent = new InputEvent("consistency-input"); String key1 = ActionStateUtil.generateKey("key1", 100, action, inputEvent); diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java index 947fa1414..0d4ddd062 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreIT.java @@ -63,7 +63,7 @@ void setUp() throws Exception { // Wait for table to be ready in the cluster waitForTableReady(); - testAction = new TestAction("test-action"); + testAction = new NoOpAction("test-action"); testEvent = new InputEvent("test-data"); } @@ -238,6 +238,65 @@ void testPruneWorksAfterRecovery() throws Exception { } } + // ==================== Multi-bucket ==================== + + @Test + @SuppressWarnings("unchecked") + void testMultiBucketRecovery() throws Exception { + // Use a separate database/table with 4 buckets to test multi-bucket scenario + String multiDb = "test_flink_agents_multi"; + String multiTable = "action_state_multi"; + AgentConfiguration multiConfig = createAgentConfiguration(multiDb, multiTable, 4); + FlussActionStateStore multiStore = new FlussActionStateStore(multiConfig); + try { + waitForTableReady(multiDb, multiTable); + + // Write states with different keys (likely distributed across buckets) + Action action1 = new NoOpAction("multi-action"); + for (int i = 0; i < 10; i++) { + String key = "multi-key-" + i; + multiStore.put(key, 1L, action1, testEvent, new ActionState(testEvent)); + } + + // Verify all states are retrievable + for (int i = 0; i < 10; i++) { + String key = "multi-key-" + i; + assertThat(multiStore.get(key, 1L, action1, 
testEvent)).isNotNull(); + } + + // Recovery marker should contain all 4 buckets + Object marker = multiStore.getRecoveryMarker(); + assertThat(marker).isInstanceOf(Map.class); + Map bucketOffsets = (Map) marker; + assertThat(bucketOffsets).hasSize(4); + + // Write more data after marker + for (int i = 10; i < 15; i++) { + String key = "multi-key-" + i; + multiStore.put(key, 1L, action1, testEvent, new ActionState(testEvent)); + } + multiStore.close(); + + // Recover into a new store instance + FlussActionStateStore recoveredStore = new FlussActionStateStore(multiConfig); + try { + recoveredStore.rebuildState(List.of(marker)); + + // Data written after marker should be recovered + for (int i = 10; i < 15; i++) { + String key = "multi-key-" + i; + assertThat(recoveredStore.get(key, 1L, action1, testEvent)).isNotNull(); + } + } finally { + recoveredStore.close(); + } + } finally { + multiStore.close(); + // Prevent double-close in tearDown + store = null; + } + } + // ==================== Helpers ==================== private AgentConfiguration createAgentConfiguration() { diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreSaslIT.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreSaslIT.java new file mode 100644 index 000000000..547a83fb4 --- /dev/null +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreSaslIT.java @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.flink.agents.runtime.actionstate; + +import org.apache.flink.agents.api.InputEvent; +import org.apache.flink.agents.plan.AgentConfiguration; +import org.apache.flink.agents.plan.actions.Action; +import org.apache.fluss.config.ConfigOptions; +import org.apache.fluss.config.Configuration; +import org.apache.fluss.server.testutils.FlussClusterExtension; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; + +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_DATABASE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_ACTION_STATE_TABLE_BUCKETS; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_BOOTSTRAP_SERVERS; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_MECHANISM; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_PASSWORD; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SASL_USERNAME; +import static org.apache.flink.agents.api.configuration.AgentConfigOptions.FLUSS_SECURITY_PROTOCOL; +import static org.assertj.core.api.Assertions.assertThat; + +/** + * Integration tests for {@link FlussActionStateStore} with SASL/PLAIN authentication against an + * embedded Fluss cluster. + */ +public class FlussActionStateStoreSaslIT { + + private static final String TEST_DATABASE = "test_flink_agents_sasl"; + private static final String TEST_TABLE = "action_state_sasl_it"; + private static final String TEST_KEY = "test-key"; + private static final String SASL_USERNAME = "testuser"; + private static final String SASL_PASSWORD = "testpass"; + + @RegisterExtension + static final FlussClusterExtension FLUSS_CLUSTER = + FlussClusterExtension.builder() + .setNumOfTabletServers(1) + .setCoordinatorServerListeners("FLUSS://localhost:0, CLIENT://localhost:0") + .setTabletServerListeners("FLUSS://localhost:0, CLIENT://localhost:0") + .setClusterConf(createSaslClusterConfig()) + .build(); + + private FlussActionStateStore store; + + @BeforeEach + void setUp() throws Exception { + AgentConfiguration config = createSaslAgentConfiguration(); + store = new FlussActionStateStore(config); + } + + @AfterEach + void tearDown() throws Exception { + if (store != null) { + store.close(); + } + } + + @Test + void testPutAndGetWithSaslAuth() throws Exception { + Action testAction = new NoOpAction("sasl-action"); + InputEvent testEvent = new InputEvent("sasl-data"); + ActionState state = new ActionState(testEvent); + + store.put(TEST_KEY, 1L, testAction, testEvent, state); + ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent); + + assertThat(retrieved).isNotNull(); + assertThat(retrieved.getTaskEvent()).isEqualTo(testEvent); + } + + @Test + void testRecoveryWithSaslAuth() throws Exception { + Action testAction = new NoOpAction("sasl-recovery-action"); + InputEvent testEvent = new InputEvent("sasl-recovery-data"); + + // Write data and capture recovery marker + store.put(TEST_KEY, 1L, testAction, testEvent, new ActionState(testEvent)); + Object marker = store.getRecoveryMarker(); + + // Write more data after marker + store.put(TEST_KEY, 2L, testAction, testEvent, new ActionState(testEvent)); + store.close(); + + // Recover into a new store instance 
with SASL + FlussActionStateStore recoveredStore = + new FlussActionStateStore(createSaslAgentConfiguration()); + try { + recoveredStore.rebuildState(java.util.List.of(marker)); + + // Data written after marker should be recovered + assertThat(recoveredStore.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull(); + // Data written before marker should NOT be in the rebuilt cache + assertThat(recoveredStore.get(TEST_KEY, 1L, testAction, testEvent)).isNull(); + } finally { + recoveredStore.close(); + store = null; + } + } + + @Test + void testMultipleWritesWithSaslAuth() throws Exception { + Action testAction = new NoOpAction("sasl-multi-action"); + InputEvent testEvent = new InputEvent("sasl-multi-data"); + + for (int i = 0; i < 5; i++) { + store.put("key-" + i, (long) i, testAction, testEvent, new ActionState(testEvent)); + } + + for (int i = 0; i < 5; i++) { + assertThat(store.get("key-" + i, (long) i, testAction, testEvent)).isNotNull(); + } + } + + // ==================== Helpers ==================== + + private static Configuration createSaslClusterConfig() { + Configuration conf = new Configuration(); + conf.setString(ConfigOptions.SERVER_SECURITY_PROTOCOL_MAP.key(), "CLIENT:sasl"); + conf.setString("security.sasl.enabled.mechanisms", "plain"); + conf.setString( + "security.sasl.plain.jaas.config", + "org.apache.fluss.security.auth.sasl.plain.PlainLoginModule required " + + " user_" + + SASL_USERNAME + + "=\"" + + SASL_PASSWORD + + "\";"); + return conf; + } + + private AgentConfiguration createSaslAgentConfiguration() { + AgentConfiguration config = new AgentConfiguration(); + String bootstrapServers = + String.join( + ",", + FLUSS_CLUSTER + .getClientConfig("CLIENT") + .get(ConfigOptions.BOOTSTRAP_SERVERS)); + config.set(FLUSS_BOOTSTRAP_SERVERS, bootstrapServers); + config.set(FLUSS_ACTION_STATE_DATABASE, TEST_DATABASE); + config.set(FLUSS_ACTION_STATE_TABLE, TEST_TABLE); + config.set(FLUSS_ACTION_STATE_TABLE_BUCKETS, 1); + config.set(FLUSS_SECURITY_PROTOCOL, "SASL"); + config.set(FLUSS_SASL_MECHANISM, "PLAIN"); + config.set(FLUSS_SASL_USERNAME, SASL_USERNAME); + config.set(FLUSS_SASL_PASSWORD, SASL_PASSWORD); + return config; + } +} diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java index 54b34dc1c..308ad3d42 100644 --- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java +++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStoreTest.java @@ -27,11 +27,15 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.io.IOException; +import java.util.Collections; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.verify; @@ -60,7 +64,7 @@ void setUp() throws Exception { new FlussActionStateStore( actionStates, mock(Connection.class), mock(Table.class), mockWriter); - testAction = new TestAction("test-action"); + testAction = new NoOpAction("test-action"); testEvent = new InputEvent("test data"); testActionState = new ActionState(testEvent); } @@ -77,86 +81,93 @@ void testPutActionState() throws Exception { } 
     @Test
-    void testGetNonExistentActionState() throws Exception {
-        actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState);
-        actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState);
-        actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState);
-        actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 4L, testAction, testEvent), testActionState);
+    void testPutActionStateWriterFailure() throws Exception {
+        when(mockWriter.append(any(InternalRow.class)))
+                .thenReturn(CompletableFuture.failedFuture(new IOException("connection lost")));
+
+        FlussActionStateStore failStore =
+                new FlussActionStateStore(
+                        actionStates, mock(Connection.class), mock(Table.class), mockWriter);
 
-        // Request a key with a different action triggers divergence cleanup
-        store.get(TEST_KEY, 2L, new TestAction("test-1"), testEvent);
+        assertThatThrownBy(
+                        () -> failStore.put(TEST_KEY, 1L, testAction, testEvent, testActionState))
+                .isInstanceOf(Exception.class);
 
-        assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull();
-        assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull();
-        assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNull();
-        assertThat(store.get(TEST_KEY, 4L, testAction, testEvent)).isNull();
+        // Cache should NOT be updated on write failure
+        String stateKey = ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent);
+        assertThat(actionStates).doesNotContainKey(stateKey);
     }
 
     @Test
-    void testGetActionStateWithDiverge() throws Exception {
+    void testGetTriggersDivergenceCleanup() throws Exception {
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState);
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState);
         // diverge: same key+seqNum, different action
         actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 2L, new TestAction("test-2"), testEvent),
+                ActionStateUtil.generateKey(TEST_KEY, 2L, new NoOpAction("test-2"), testEvent),
                 testActionState);
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState);
-        actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 4L, testAction, testEvent), testActionState);
 
-        store.get(TEST_KEY, 2L, testAction, testEvent);
+        store.get(TEST_KEY, 2L, new NoOpAction("test-1"), testEvent);
 
+        // Divergence detected at seqNum 2 → removeIf clears seqNum > 2
         assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull();
         assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull();
         assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNull();
-        assertThat(store.get(TEST_KEY, 4L, testAction, testEvent)).isNull();
     }
 
+    // ==================== rebuildState tests ====================
+
     @Test
-    void testPruneState() throws Exception {
+    void testRebuildStateSkipsOnEmptyMarkers() throws Exception {
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState);
+
+        store.rebuildState(Collections.emptyList());
+
+        // Empty markers → rebuild is skipped, cache is NOT cleared
+        assertThat(actionStates).isNotEmpty();
+    }
+
+    @Test
+    void testRebuildStateSkipsOnNonMapMarker() throws Exception {
         actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState);
+                ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState);
+
+        // A non-Map marker is ignored, resulting in empty bucketStartOffsets.
+        // Note: rebuildState clears the cache before checking offsets,
+        // so the cache will be empty even though no actual rebuild occurs.
+        store.rebuildState(List.of("invalid-marker"));
+
+        assertThat(actionStates).isEmpty();
+    }
+
+    @Test
+    void testRebuildStateSkipsOnEmptyBucketOffsets() throws Exception {
         actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState);
+                ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent), testActionState);
 
-        assertThat(store.get(TEST_KEY, 1L, testAction, testEvent)).isNotNull();
-        assertThat(store.get(TEST_KEY, 2L, testAction, testEvent)).isNotNull();
-        assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull();
-
-        // Prune states up to sequence number 2
-        store.pruneState(TEST_KEY, 2L);
-
-        assertThat(
-                        actionStates.get(
-                                ActionStateUtil.generateKey(TEST_KEY, 1L, testAction, testEvent)))
-                .isNull();
-        assertThat(
-                        actionStates.get(
-                                ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent)))
-                .isNull();
-        assertThat(store.get(TEST_KEY, 3L, testAction, testEvent)).isNotNull();
+        // Empty map marker → no valid bucket offsets.
+        // Same as above: cache is cleared before the early-return check.
+        store.rebuildState(List.of(Map.of()));
+
+        assertThat(actionStates).isEmpty();
     }
 
     @Test
-    void testActionStateUpdates() throws Exception {
-        store.put(TEST_KEY, 1L, testAction, testEvent, testActionState);
+    void testCloseClosesResources() throws Exception {
+        Table mockTable = mock(Table.class);
+        Connection mockConnection = mock(Connection.class);
+
+        FlussActionStateStore closeableStore =
+                new FlussActionStateStore(actionStates, mockConnection, mockTable, mockWriter);
 
-        // Update the same key with a new state
-        InputEvent updatedEvent = new InputEvent("updated data");
-        ActionState updatedState = new ActionState(updatedEvent);
-        store.put(TEST_KEY, 1L, testAction, testEvent, updatedState);
+        closeableStore.close();
 
-        ActionState retrieved = store.get(TEST_KEY, 1L, testAction, testEvent);
-        assertThat(retrieved).isNotNull();
-        assertThat(retrieved.getTaskEvent()).isEqualTo(updatedEvent);
+        verify(mockTable).close();
+        verify(mockConnection).close();
     }
 }
diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java
index 33b4193dd..7cf829c00 100644
--- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java
+++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/KafkaActionStateStoreTest.java
@@ -74,7 +74,7 @@ void setUp() throws Exception {
                 TEST_TOPIC);
 
         // Create test objects
-        testAction = new TestAction("test-action");
+        testAction = new NoOpAction("test-action");
         testEvent = new InputEvent("test data");
         testActionState = new ActionState(testEvent);
     }
@@ -105,7 +105,7 @@ void testGetNonExistentActionState() throws Exception {
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 4L, testAction, testEvent), testActionState);
 
-        actionStateStore.get(TEST_KEY, 2L, new TestAction("test-1"), testEvent);
+        actionStateStore.get(TEST_KEY, 2L, new NoOpAction("test-1"), testEvent);
 
         assertNotNull(actionStateStore.get(TEST_KEY, 1L, testAction, testEvent));
         assertNotNull(actionStateStore.get(TEST_KEY, 2L, testAction, testEvent));
@@ -121,7 +121,7 @@ void testGetActionStateWithDiverge() throws Exception {
                 ActionStateUtil.generateKey(TEST_KEY, 2L, testAction, testEvent), testActionState);
         // diverge here
         actionStates.put(
-                ActionStateUtil.generateKey(TEST_KEY, 2L, new TestAction("test-2"), testEvent),
+                ActionStateUtil.generateKey(TEST_KEY, 2L, new NoOpAction("test-2"), testEvent),
                 testActionState);
         actionStates.put(
                 ActionStateUtil.generateKey(TEST_KEY, 3L, testAction, testEvent), testActionState);
diff --git a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/NoOpAction.java
similarity index 91%
rename from runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java
rename to runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/NoOpAction.java
index de097c5a2..cfc1f5d20 100644
--- a/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/TestAction.java
+++ b/runtime/src/test/java/org/apache/flink/agents/runtime/actionstate/NoOpAction.java
@@ -26,17 +26,17 @@
 import java.util.List;
 
 /** A no-op {@link Action} for use in action state store tests. */
-public class TestAction extends Action {
+public class NoOpAction extends Action {
 
     public static void doNothing(Event event, RunnerContext context) {
         // No operation
     }
 
-    public TestAction(String name) throws Exception {
+    public NoOpAction(String name) throws Exception {
         super(
                 name,
                 new JavaFunction(
-                        TestAction.class.getName(),
+                        NoOpAction.class.getName(),
                         "doNothing",
                         new Class[] {Event.class, RunnerContext.class}),
                 List.of(InputEvent.class.getName()));

From ff1130c844b70ab6fea484b1c766207e8ca55662 Mon Sep 17 00:00:00 2001
From: Junbo Wang
Date: Thu, 30 Apr 2026 11:49:21 +0800
Subject: [PATCH 8/9] fix(actionstate): Unsubscribe from the processed bucket.

---
 .../flink/agents/runtime/actionstate/FlussActionStateStore.java | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
index ebbf8a40c..a9327c042 100644
--- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
+++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
@@ -351,6 +351,7 @@ private void pollAndReplay(LogScanner scanner, Map<Integer, Long> remainingBuckets) {
                 long lastSeenOffset = replayRecords(records.records(bucket), endOffset);
                 if (lastSeenOffset + 1 >= endOffset) {
                     remainingBuckets.remove(bucket.getBucket());
+                    scanner.unsubscribe(bucket.getBucket());
                 }
             }

From e96f36786c9330cbac4a9027c9c3ce17f32ea147 Mon Sep 17 00:00:00 2001
From: Junbo Wang
Date: Thu, 30 Apr 2026 16:31:44 +0800
Subject: [PATCH 9/9] fix(runtime): Correct data recovery logic in
 FlussActionStateStore.
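
The previous skip condition treated endOffset <= startOffset as "no new
data", which silently swallowed the endOffset < startOffset case where the
log has been truncated or reset below the checkpointed start offset. The
corrected logic skips only the exact endOffset == startOffset case and fails
fast whenever the replay window [startOffset, endOffset) is no longer fully
readable. Condensed sketch of the per-bucket decision after this patch
(variable names as in subscribeEffectiveOffsets; not a literal excerpt):

    if (endOffset == startOffset) {
        // Nothing was written since the checkpoint; skip this bucket.
    } else if (earliestOffset > startOffset || endOffset < startOffset) {
        // Retention or truncation removed part of the replay window;
        // recovering would silently lose state, so fail instead.
        throw new IllegalStateException("Data loss detected for bucket " + bucket);
    } else {
        // The window is fully available; subscribe and replay it.
        scanner.subscribe(bucket, startOffset);
    }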
---
 .../actionstate/FlussActionStateStore.java   | 37 +++++++++++--------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
index a9327c042..e91db0630 100644
--- a/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
+++ b/runtime/src/main/java/org/apache/flink/agents/runtime/actionstate/FlussActionStateStore.java
@@ -78,6 +78,8 @@ public class FlussActionStateStore implements ActionStateStore {
 
     private static final Duration POLL_TIMEOUT = Duration.ofSeconds(1);
 
+    private static final String SECURITY_PROTOCOL_PLAINTEXT = "PLAINTEXT";
+
     // Column names in the Fluss table schema
     private static final String COL_NAME_STATE_KEY = "state_key";
     private static final String COL_NAME_STATE_PAYLOAD = "state_payload";
@@ -134,7 +136,7 @@ public FlussActionStateStore(AgentConfiguration agentConfiguration) {
         // cause the Fluss client to attempt an unwanted SASL handshake.
         String securityProtocol = agentConfiguration.get(FLUSS_SECURITY_PROTOCOL);
         flussConf.setString(CLIENT_SECURITY_PROTOCOL, securityProtocol);
-        if (!"PLAINTEXT".equalsIgnoreCase(securityProtocol)) {
+        if (!SECURITY_PROTOCOL_PLAINTEXT.equalsIgnoreCase(securityProtocol)) {
             flussConf.setString(
                     CLIENT_SASL_MECHANISM, agentConfiguration.get(FLUSS_SASL_MECHANISM));
@@ -312,25 +314,25 @@ private Map<Integer, Long> subscribeEffectiveOffsets(
 
             // No new data since checkpoint (includes empty buckets that never had writes:
             // endOffset=0, startOffset=0)
-            if (endOffset <= startOffset) {
+            if (endOffset == startOffset) {
                 LOG.info(
                         "Skipping bucket {} for rebuild: no new data "
-                                + "(endOffset={} <= startOffset={})",
+                                + "(endOffset={} = startOffset={})",
                         bucket,
                         endOffset,
                         startOffset);
                 continue;
             }
 
-            // Retention has cleaned data in the recovery window — data loss
-            if (earliestOffset > startOffset) {
-                throw new RuntimeException(
+            // Data loss: retention cleaned the recovery window, or log was truncated/reset
+            if (earliestOffset > startOffset || endOffset < startOffset) {
+                throw new IllegalStateException(
                         String.format(
-                                "Data loss detected for bucket %s: "
-                                        + "retention has cleaned data in the recovery window "
-                                        + "(earliestOffset=%d > startOffset=%d). "
-                                        + "Cannot recover state safely.",
-                                bucket, earliestOffset, startOffset));
+                                "Data loss detected for bucket %d: required data is no longer "
+                                        + "available in the log (startOffset=%d, endOffset=%d, "
+                                        + "earliestOffset=%d). Increase log retention or reduce "
+                                        + "checkpoint interval.",
+                                bucket, startOffset, endOffset, earliestOffset));
             }
 
             scanner.subscribe(bucket, startOffset);
@@ -439,11 +441,14 @@ public void pruneState(Object key, long seqNum) {
 
     @Override
     public void close() throws Exception {
-        if (table != null) {
-            table.close();
-        }
-        if (connection != null) {
-            connection.close();
+        try {
+            if (table != null) {
+                table.close();
+            }
+        } finally {
+            if (connection != null) {
+                connection.close();
+            }
         }
     }
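
A quick illustration of the marker-merge semantics the recovery path relies
on (a sketch only, not part of the patch series; the bucket ids and offsets
below are made up). mergeRecoveryMarkerOffsets folds every recovery marker
into one map with Math::min, so overlapping markers always widen the replay
window:

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    class MarkerMergeSketch {
        public static void main(String[] args) {
            // Two markers restored from different checkpointed subtasks:
            //   marker A: {bucket 0 -> offset 5, bucket 1 -> offset 7}
            //   marker B: {bucket 0 -> offset 3}
            List<Map<Integer, Long>> markers =
                    List.of(Map.of(0, 5L, 1, 7L), Map.of(0, 3L));

            Map<Integer, Long> merged = new HashMap<>();
            for (Map<Integer, Long> marker : markers) {
                for (Map.Entry<Integer, Long> entry : marker.entrySet()) {
                    // Math::min keeps the earliest start offset per bucket,
                    // covering the widest recovery window across all markers.
                    merged.merge(entry.getKey(), entry.getValue(), Math::min);
                }
            }

            // merged now maps bucket 0 -> 3 and bucket 1 -> 7, so bucket 0
            // replays from offset 3 and bucket 1 from offset 7.
            System.out.println(merged);
        }
    }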