diff --git a/.build/checkstyle.xml b/.build/checkstyle.xml
index 7e91df304cb9..99faf7335e1b 100644
--- a/.build/checkstyle.xml
+++ b/.build/checkstyle.xml
@@ -58,15 +58,9 @@
-
-
-
-
-
-
-
-
-
+
+
+
diff --git a/.gitmodules b/.gitmodules
index 616dacf610a7..c8ef94c8236e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,4 +1,4 @@
[submodule "modules/accord"]
path = modules/accord
- url = https://github.com/apache/cassandra-accord.git
- branch = trunk
+ url = https://github.com/belliottsmith/cassandra-accord.git
+ branch = tail-latency
diff --git a/CHANGES.txt b/CHANGES.txt
index 96177c865671..0e6adb1f7a78 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,7 @@
6.0-alpha2
+ * Accord: Tail Latency Improvements (CASSANDRA-21361)
+ * Artificial Latency Injection (CASSANDRA-17024)
+ * Accord: Clean Shutdown/Restart, Rebootstrap, et al (CASSANDRA-21355)
* Reduce memory allocations in SelectStatement.getQuery (CASSANDRA-21351)
* Avoid allocation by getFunctions in SelectStatement.authorize (CASSANDRA-21347)
* Avoid unit conversion in DatabaseDescriptor.getMaxValueSize() for every deserializing Cell (CASSANDRA-21295)
diff --git a/modules/accord b/modules/accord
index d3c593558908..e11a4e8efdc6 160000
--- a/modules/accord
+++ b/modules/accord
@@ -1 +1 @@
-Subproject commit d3c5935589082f784b10dbee4d89691612f680eb
+Subproject commit e11a4e8efdc60118a16654ddd6c8f7fccb1bb425
diff --git a/src/java/org/apache/cassandra/concurrent/ExecutorLocals.java b/src/java/org/apache/cassandra/concurrent/ExecutorLocals.java
index a5720dd37cf1..55c6ee24642a 100644
--- a/src/java/org/apache/cassandra/concurrent/ExecutorLocals.java
+++ b/src/java/org/apache/cassandra/concurrent/ExecutorLocals.java
@@ -33,7 +33,7 @@
*/
public class ExecutorLocals implements WithResources, Closeable
{
- private static final ExecutorLocals none = new ExecutorLocals(null, null);
+ private static final ExecutorLocals none = new ExecutorLocals(null, null, false);
private static final FastThreadLocal locals = new FastThreadLocal()
{
@Override
@@ -45,20 +45,23 @@ protected ExecutorLocals initialValue()
public static class Impl
{
- protected static void set(TraceState traceState, ClientWarn.State clientWarnState)
+ @SuppressWarnings("resource")
+ protected static void set(TraceState traceState, ClientWarn.State clientWarnState, boolean eligibleForArtificialLatency)
{
- if (traceState == null && clientWarnState == null) locals.set(none);
- else locals.set(new ExecutorLocals(traceState, clientWarnState));
+ if (traceState == null && clientWarnState == null && !eligibleForArtificialLatency) locals.set(none);
+ else locals.set(new ExecutorLocals(traceState, clientWarnState, eligibleForArtificialLatency));
}
}
public final TraceState traceState;
public final ClientWarn.State clientWarnState;
+ public final boolean eligibleForArtificialLatency;
- protected ExecutorLocals(TraceState traceState, ClientWarn.State clientWarnState)
+ protected ExecutorLocals(TraceState traceState, ClientWarn.State clientWarnState, boolean eligibleForArtificialLatency)
{
this.traceState = traceState;
this.clientWarnState = clientWarnState;
+ this.eligibleForArtificialLatency = eligibleForArtificialLatency;
}
/**
@@ -82,7 +85,7 @@ public static WithResources propagate()
public static ExecutorLocals create(TraceState traceState)
{
ExecutorLocals current = locals.get();
- return current.traceState == traceState ? current : new ExecutorLocals(traceState, current.clientWarnState);
+ return current.traceState == traceState ? current : new ExecutorLocals(traceState, current.clientWarnState, current.eligibleForArtificialLatency);
}
public static void clear()
diff --git a/src/java/org/apache/cassandra/config/AccordSpec.java b/src/java/org/apache/cassandra/config/AccordConfig.java
similarity index 75%
rename from src/java/org/apache/cassandra/config/AccordSpec.java
rename to src/java/org/apache/cassandra/config/AccordConfig.java
index 9965a52e594e..1ec16240a03e 100644
--- a/src/java/org/apache/cassandra/config/AccordSpec.java
+++ b/src/java/org/apache/cassandra/config/AccordConfig.java
@@ -22,23 +22,28 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
+import accord.api.ProtocolModifiers.CleanCfkBefore;
+import accord.api.ProtocolModifiers.CoordinatorBacklogExecution;
+import accord.api.ProtocolModifiers.FastExecution;
+import accord.api.ProtocolModifiers.ReplicaExecution;
+import accord.api.ProtocolModifiers.SendStableMessages;
+import accord.primitives.TxnId;
import accord.utils.Invariants;
import org.apache.cassandra.journal.Params;
import org.apache.cassandra.service.accord.serializers.Version;
import org.apache.cassandra.service.consensus.TransactionalMode;
-import static org.apache.cassandra.config.AccordSpec.CatchupMode.NORMAL;
-import static org.apache.cassandra.config.AccordSpec.QueueShardModel.THREAD_POOL_PER_SHARD;
-import static org.apache.cassandra.config.AccordSpec.QueueSubmissionModel.SYNC;
-import static org.apache.cassandra.config.AccordSpec.RangeIndexMode.in_memory;
+import static org.apache.cassandra.config.AccordConfig.CatchupMode.NORMAL;
+import static org.apache.cassandra.config.AccordConfig.QueuePriorityModel.HLC_FIFO;
+import static org.apache.cassandra.config.AccordConfig.QueueShardModel.THREAD_POOL_PER_SHARD;
+import static org.apache.cassandra.config.AccordConfig.QueueSubmissionModel.SYNC;
+import static org.apache.cassandra.config.AccordConfig.RangeIndexMode.in_memory;
-// TODO (expected): rename to AccordConf?
-public class AccordSpec
+public class AccordConfig
{
public volatile boolean enabled = false;
- // TODO (expected): move to JournalSpec
public volatile String journal_directory;
/**
@@ -107,6 +112,37 @@ public enum QueueSubmissionModel
EXEC_ST
}
+ public enum QueuePriorityModel
+ {
+ /**
+ * All work is queued on a first-come first-serve basis.
+ * Overload can lead to more rapid degradation, as later phases of the state machine are delayed
+ * by the arrival of new work.
+ */
+ FIFO,
+
+ /**
+ * If the work has an associated TxnId, prioritise by its HLC (and FIFO otherwise)
+ */
+ HLC_FIFO,
+
+ /**
+ * Prioritise Apply, Stable, Commit, Accept, and PreAccept messages in that order.
+ * Within a given message type, prioritise by HLC.
+ * Other messages will be mixed with PreAccept messages, but using the next counter rather than the HLC of the TxnId.
+ * Note: this can have some performance edge cases for contended keys, as we may process Stable messages for later commands before
+ * we process earlier Accept/Commit, which may delay execution
+ */
+ PHASE_HLC_FIFO,
+
+ /**
+ * Prioritise Apply, Stable, Commit, Accept, and PreAccept messages from the original coordinator only, in that order.
+ * Within a given message type, prioritise by HLC.
+ * Other messages will be mixed with PreAccept messages, but using the next counter rather than the HLC of the TxnId.
+ */
+ ORIG_PHASE_HLC_FIFO
+ }
+
public QueueShardModel queue_shard_model = THREAD_POOL_PER_SHARD;
public QueueSubmissionModel queue_submission_model = SYNC;
@@ -115,6 +151,15 @@ public enum QueueSubmissionModel
*/
public volatile OptionaldPositiveInt queue_shard_count = OptionaldPositiveInt.UNDEFINED;
+ public QueuePriorityModel queue_priority_model = HLC_FIFO;
+
+ // yield to other executor threads after executing this many tasks in a row, if there are waiting threads and tasks
+ public int queue_yield_interval = 100;
+
+ /**
+ * If the HLC is older than this, queue by FIFO instead
+ */
+ public DurationSpec.IntMillisecondsBound queue_priority_age_to_fifo = new DurationSpec.IntMillisecondsBound(500);
/**
* The target number of command stores to create per topology shard.
* This determines the amount of execution parallelism possible for a given table/shard on the host.
@@ -165,6 +210,10 @@ public enum QueueSubmissionModel
public volatile DurationSpec.IntSecondsBound shard_durability_cycle = new DurationSpec.IntSecondsBound(5, TimeUnit.MINUTES);
public volatile DurationSpec.IntSecondsBound global_durability_cycle = new DurationSpec.IntSecondsBound(5, TimeUnit.MINUTES);
+ public volatile DurationSpec.IntSecondsBound topology_watermark_interval = new DurationSpec.IntSecondsBound(60);
+ public volatile boolean topology_sync_propagator_enabled_pre_start = false;
+ public volatile boolean topology_sync_propagator_enabled_post_startup = false;
+
public enum TransactionalRangeMigration
{
auto, explicit
@@ -178,11 +227,6 @@ public enum TransactionalRangeMigration
*/
public volatile TransactionalRangeMigration range_migration = TransactionalRangeMigration.auto;
- public enum RebootstrapMode
- {
- full_repair, truncate_and_stream
- }
-
public enum CatchupMode
{
DISABLED,
@@ -195,15 +239,42 @@ public enum CatchupMode
* default transactional mode for tables created by this node when no transactional mode has been specified in the DDL
*/
public TransactionalMode default_transactional_mode = TransactionalMode.off;
- public boolean ephemeralReadEnabled = true;
+
+ // ******** PROTOCOL MODIFIERS ***********
+
+ public CoordinatorBacklogExecution coordinator_backlog_execution;
+ public TxnId.FastPath permit_fast_path;
+ public Boolean permit_track_stable_medium_path;
+ public Boolean permit_fast_quorum_medium_path;
+ public Boolean always_inform_durable_single_key;
+ public ReplicaExecution replica_execution;
+ public Float replica_execution_distributed_persist_chance;
+ public FastExecution fast_write_execution;
+ public FastExecution fast_read_execution;
+ public CleanCfkBefore clean_cfk_before;
+ public SendStableMessages send_stable;
+ /**
+ * include the least information expected to be necessary in messages -
+ * this is more efficient but may lead to some additional traffic and latency when earlier messages have not yet arrived
+ */
+ public Boolean send_minimal;
+ // note: simulator incompatible (for now)
+ public Boolean precise_micros;
+
+ public boolean ephemeral_reads = true;
public boolean state_cache_listener_jfr_enabled = false;
public float hard_reject_ratio = 0.5f;
- public int min_soft_reject_count = 10;
- public int max_soft_reject_count = 100;
+ public int min_soft_reject_count = 100;
+ public int max_soft_reject_count = 1000;
public DurationSpec.LongMicrosecondsBound soft_reject_age = new DurationSpec.LongMicrosecondsBound("10s");
public DurationSpec.LongMicrosecondsBound soft_reject_cumulative_age = new DurationSpec.LongMicrosecondsBound("60s");
+
+ public DurationSpec.IntSecondsBound commands_for_key_prune_delta = new DurationSpec.IntSecondsBound(1);
+ public int commands_for_key_prune_interval = 64;
+ public DurationSpec.IntSecondsBound max_conflicts_prune_delta = new DurationSpec.IntSecondsBound(1);
+
public DurationSpec.IntSecondsBound catchup_on_start_success_latency = new DurationSpec.IntSecondsBound(60);
public DurationSpec.IntSecondsBound catchup_on_start_fail_latency = new DurationSpec.IntSecondsBound(900);
public int catchup_on_start_max_attempts = 5;
@@ -215,7 +286,7 @@ public enum CatchupMode
public enum RangeIndexMode { in_memory, journal_sai }
public RangeIndexMode range_index_mode = in_memory;
- public final JournalSpec journal = new JournalSpec();
+ public final JournalConfig journal = new JournalConfig();
public enum MixedTimeSourceHandling
{
@@ -224,7 +295,7 @@ public enum MixedTimeSourceHandling
public volatile MixedTimeSourceHandling mixedTimeSourceHandling = MixedTimeSourceHandling.reject;
- public static class JournalSpec implements Params
+ public static class JournalConfig implements Params
{
public enum ReplayMode
{
@@ -264,11 +335,17 @@ public enum StopMarkerFailurePolicy
*/
EXIT,
+ /**
+ * @deprecated since alpha release, replaced by ALLOW_UNSAFE_STARTUP for consistency with FailurePolicy.ALLOW_UNSAFE_STARTUP
+ */
+ @Deprecated(since="6.0")
+ UNSAFE_STARTUP,
+
/**
* If the start marker exceeds the stop marker, permit startup, assuming the consensus log has been determined complete externally.
* Note this is VERY UNSAFE if you care about isolation guarantees.
*/
- UNSAFE_STARTUP,
+ ALLOW_UNSAFE_STARTUP,
REBOOTSTRAP
}
@@ -289,7 +366,7 @@ public enum StopMarkerFailurePolicy
public Version version = Version.DOWNGRADE_SAFE_VERSION;
public boolean enable_compaction = true;
- public JournalSpec setFlushPeriod(DurationSpec newFlushPeriod)
+ public JournalConfig setFlushPeriod(DurationSpec newFlushPeriod)
{
flushPeriod = newFlushPeriod;
flushCombinedBlockPeriod = Long.MIN_VALUE;
diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
index c72577e7ec1c..4775a8425f9d 100644
--- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
+++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java
@@ -44,6 +44,7 @@ public enum CassandraRelevantProperties
{
ACCORD_AGENT_CLASS("cassandra.test.accord.agent"),
ACCORD_ALLOW_TEST_MODES("cassandra.test.accord.allow_test_modes", "false"),
+ ACCORD_DEBUG_EXECUTION("accord.debug_execution"),
ACCORD_KEY_PARANOIA_COSTFACTOR(Invariants.KEY_PARANOIA_COSTFACTOR),
ACCORD_KEY_PARANOIA_CPU(Invariants.KEY_PARANOIA_CPU),
ACCORD_KEY_PARANOIA_MEMORY(Invariants.KEY_PARANOIA_MEMORY),
@@ -62,6 +63,10 @@ public enum CassandraRelevantProperties
ALLOW_UNSAFE_REPLACE("cassandra.allow_unsafe_replace"),
ALLOW_UNSAFE_TRANSIENT_CHANGES("cassandra.allow_unsafe_transient_changes"),
APPROXIMATE_TIME_PRECISION_MS("cassandra.approximate_time_precision_ms", "2"),
+ ARTIFICIAL_LATENCIES("cassandra.artificial_latencies"),
+ ARTIFICIAL_LATENCIES_UNSAFE("cassandra.artificial_latencies_unsafe"),
+ ARTIFICIAL_LATENCY_LIMIT("cassandra.artificial_latency_limit", "200ms"),
+ ARTIFICIAL_LATENCY_VERBS("cassandra.artificial_latency_verbs"),
ASYNC_PROFILER_ENABLED("cassandra.async_profiler.enabled", "false"),
ASYNC_PROFILER_UNSAFE_MODE("cassandra.async_profiler.unsafe_mode", "false"),
/** 2 ** GENSALT_LOG2_ROUNDS rounds of hashing will be performed. */
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index dd75d910c830..ba50692cfb10 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -165,8 +165,6 @@ public static Set splitCommaDelimited(String src)
@Replaces(oldName = "cas_contention_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound cas_contention_timeout = new DurationSpec.LongMillisecondsBound("1800ms");
- public volatile DurationSpec.LongMillisecondsBound accord_preaccept_timeout = new DurationSpec.LongMillisecondsBound("1s");
-
@Replaces(oldName = "truncate_request_timeout_in_ms", converter = Converters.MILLIS_DURATION_LONG, deprecated = true)
public volatile DurationSpec.LongMillisecondsBound truncate_request_timeout = new DurationSpec.LongMillisecondsBound("60000ms");
@@ -1238,7 +1236,7 @@ public enum PaxosOnLinearizabilityViolation
*/
public ParameterizedClass default_compaction = null;
- public final AccordSpec accord = new AccordSpec();
+ public final AccordConfig accord = new AccordConfig();
public static Supplier getOverrideLoadConfig()
{
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index aec9c0267187..24a8384b7a8d 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -126,6 +126,7 @@
import org.apache.cassandra.service.FileSystemOwnershipCheck;
import org.apache.cassandra.service.StartupChecks;
import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.accord.AccordService;
import org.apache.cassandra.service.accord.api.AccordWaitStrategies;
import org.apache.cassandra.service.consensus.TransactionalMode;
import org.apache.cassandra.service.paxos.Paxos;
@@ -574,7 +575,7 @@ private static void applyAll() throws ConfigurationException
applyGuardrails();
- applyAccordProgressLog();
+ applyAccord();
applyStartupChecks();
}
@@ -1370,7 +1371,7 @@ private static void applyGuardrails()
}
}
- private static void applyAccordProgressLog()
+ private static void applyAccord()
{
try
{
@@ -1382,6 +1383,7 @@ private static void applyAccordProgressLog()
{
throw new ConfigurationException("Invalid accord progress log configuration: " + e.getMessage(), e);
}
+ AccordService.applyProtocolModifiers(getAccord());
}
public static StartupChecksConfiguration getStartupChecksConfiguration()
@@ -1663,12 +1665,6 @@ static void checkForLowestAcceptedTimeouts(Config conf)
logInfo("truncate_request_timeout", conf.truncate_request_timeout, LOWEST_ACCEPTED_TIMEOUT);
conf.truncate_request_timeout = LOWEST_ACCEPTED_TIMEOUT;
}
-
- if (conf.accord_preaccept_timeout.toMilliseconds() < LOWEST_ACCEPTED_TIMEOUT.toMilliseconds())
- {
- logInfo("accord_preaccept_timeout", conf.accord_preaccept_timeout, LOWEST_ACCEPTED_TIMEOUT);
- conf.accord_preaccept_timeout = LOWEST_ACCEPTED_TIMEOUT;
- }
}
private static void logInfo(String property, DurationSpec.LongMillisecondsBound actualValue, DurationSpec.LongMillisecondsBound lowestAcceptedValue)
@@ -5634,18 +5630,17 @@ public static void setUseStatementsEnabled(boolean enabled)
}
}
-
- public static AccordSpec getAccord()
+ public static AccordConfig getAccord()
{
return conf.accord;
}
- public static AccordSpec.TransactionalRangeMigration getTransactionalRangeMigration()
+ public static AccordConfig.TransactionalRangeMigration getTransactionalRangeMigration()
{
return conf.accord.range_migration;
}
- public static void setTransactionalRangeMigration(AccordSpec.TransactionalRangeMigration val)
+ public static void setTransactionalRangeMigration(AccordConfig.TransactionalRangeMigration val)
{
conf.accord.range_migration = Preconditions.checkNotNull(val);
}
@@ -5670,12 +5665,12 @@ public static void setAccordTransactionsEnabled(boolean b)
conf.accord.enabled = b;
}
- public static AccordSpec.QueueShardModel getAccordQueueShardModel()
+ public static AccordConfig.QueueShardModel getAccordQueueShardModel()
{
return conf.accord.queue_shard_model;
}
- public static AccordSpec.QueueSubmissionModel getAccordQueueSubmissionModel()
+ public static AccordConfig.QueueSubmissionModel getAccordQueueSubmissionModel()
{
return conf.accord.queue_submission_model;
}
@@ -6278,7 +6273,7 @@ public static void setPaxosRepairRaceWait(boolean paxosRepairRaceWait)
public static boolean getAccordEphemeralReadEnabledEnabled()
{
- return conf.accord.ephemeralReadEnabled;
+ return conf.accord.ephemeral_reads;
}
public static AutoRepairConfig getAutoRepairConfig()
diff --git a/src/java/org/apache/cassandra/cql3/Attributes.java b/src/java/org/apache/cassandra/cql3/Attributes.java
index 2d3238f0bd02..8ee799579099 100644
--- a/src/java/org/apache/cassandra/cql3/Attributes.java
+++ b/src/java/org/apache/cassandra/cql3/Attributes.java
@@ -153,9 +153,9 @@ public int getTimeToLive(QueryOptions options, TableMetadata metadata) throws In
public void collectMarkerSpecification(VariableSpecifications boundNames)
{
if (timestamp != null)
- timestamp.collectMarkerSpecification(boundNames);
+ timestamp.collectMarkerSpecification(boundNames, this);
if (timeToLive != null)
- timeToLive.collectMarkerSpecification(boundNames);
+ timeToLive.collectMarkerSpecification(boundNames, this);
}
public static class Raw
diff --git a/src/java/org/apache/cassandra/cql3/ColumnsExpression.java b/src/java/org/apache/cassandra/cql3/ColumnsExpression.java
index b078fe5196ae..0db17261bf0f 100644
--- a/src/java/org/apache/cassandra/cql3/ColumnsExpression.java
+++ b/src/java/org/apache/cassandra/cql3/ColumnsExpression.java
@@ -386,10 +386,10 @@ public boolean isMapElementExpression()
* @param boundNames the variables specification where to collect the
* bind variables of the map key/collection element in.
*/
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
if (element != null)
- element.collectMarkerSpecification(boundNames);
+ element.collectMarkerSpecification(boundNames, owner);
}
/**
diff --git a/src/java/org/apache/cassandra/cql3/ElementExpression.java b/src/java/org/apache/cassandra/cql3/ElementExpression.java
index 3665f71bb897..5d89c74427df 100644
--- a/src/java/org/apache/cassandra/cql3/ElementExpression.java
+++ b/src/java/org/apache/cassandra/cql3/ElementExpression.java
@@ -140,9 +140,9 @@ public AbstractType> type()
* @param boundNames the variables specification where to collect the
* bind variables of the map key/collection element in.
*/
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- keyOrIndex.collectMarkerSpecification(boundNames);
+ keyOrIndex.collectMarkerSpecification(boundNames, owner);
}
/**
diff --git a/src/java/org/apache/cassandra/cql3/Json.java b/src/java/org/apache/cassandra/cql3/Json.java
index 7f4bc64f668c..be044f0a1a12 100644
--- a/src/java/org/apache/cassandra/cql3/Json.java
+++ b/src/java/org/apache/cassandra/cql3/Json.java
@@ -86,7 +86,7 @@ public Marker(int bindIndex)
public Prepared prepareAndCollectMarkers(TableMetadata metadata, Collection receivers, VariableSpecifications boundNames)
{
- boundNames.add(bindIndex, makeReceiver(metadata));
+ boundNames.add(bindIndex, makeReceiver(metadata), null);
return new PreparedMarker(bindIndex, receivers);
}
@@ -239,7 +239,7 @@ public DelayedColumnValue(PreparedMarker prepared, ColumnMetadata column, boolea
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
// We've already collected what we should (and in practice this method is never called).
}
diff --git a/src/java/org/apache/cassandra/cql3/Operation.java b/src/java/org/apache/cassandra/cql3/Operation.java
index 646c07e57472..3d8555849b2f 100644
--- a/src/java/org/apache/cassandra/cql3/Operation.java
+++ b/src/java/org/apache/cassandra/cql3/Operation.java
@@ -103,12 +103,13 @@ public boolean requiresTimestamp()
* Collects the column specification for the bind variables of this operation.
*
* @param boundNames the list of column specification where to collect the
- * bind variables of this term in.
+ * bind variables of this term in.
+ * @param owner the statement (or sub-statement) that owns any collected bind markers, used to attribute
+ *              markers to their owning sub-statement (see {@code VariableSpecifications}); may be {@code null}
*/
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
if (t != null)
- t.collectMarkerSpecification(boundNames);
+ t.collectMarkerSpecification(boundNames, owner);
}
/**
diff --git a/src/java/org/apache/cassandra/cql3/Ordering.java b/src/java/org/apache/cassandra/cql3/Ordering.java
index a0d95a89031e..40f9b616e9ae 100644
--- a/src/java/org/apache/cassandra/cql3/Ordering.java
+++ b/src/java/org/apache/cassandra/cql3/Ordering.java
@@ -134,14 +134,14 @@ public Raw(Expression expression, Direction direction)
* Resolves column identifiers against the table schema.
* Binds markers (?) to columns.
*/
- public Ordering bind(TableMetadata table, VariableSpecifications boundNames)
+ public Ordering bind(TableMetadata table, VariableSpecifications boundNames, Object owner)
{
- return new Ordering(expression.bind(table, boundNames), direction);
+ return new Ordering(expression.bind(table, boundNames, owner), direction);
}
public interface Expression
{
- Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames);
+ Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames, Object owner);
}
public static class SingleColumn implements Expression
@@ -154,7 +154,7 @@ public static class SingleColumn implements Expression
}
@Override
- public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames)
+ public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames, Object owner)
{
return new Ordering.SingleColumn(table.getExistingColumn(column), table);
}
@@ -172,11 +172,11 @@ public static class Ann implements Expression
}
@Override
- public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames)
+ public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames, Object owner)
{
ColumnMetadata column = table.getExistingColumn(columnId);
Term value = vectorValue.prepare(table.keyspace, column);
- value.collectMarkerSpecification(boundNames);
+ value.collectMarkerSpecification(boundNames, owner);
return new Ordering.Ann(column, table, value);
}
}
diff --git a/src/java/org/apache/cassandra/cql3/QueryOptions.java b/src/java/org/apache/cassandra/cql3/QueryOptions.java
index b7003388af7a..3244497914ff 100644
--- a/src/java/org/apache/cassandra/cql3/QueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/QueryOptions.java
@@ -116,7 +116,7 @@ public static QueryOptions create(ConsistencyLevel consistency,
values,
ByteArrayUtil.EMPTY_ARRAY_OF_BYTE_ARRAYS,
skipMetadata,
- new SpecificOptions(pageSize, pagingState, serialConsistency, timestamp, keyspace, nowInSeconds),
+ new SpecificOptions(pageSize, pagingState, serialConsistency, timestamp, keyspace, nowInSeconds, false),
version);
}
@@ -264,6 +264,11 @@ public long getNowInSeconds(QueryState state)
return nowInSeconds != UNSET_NOWINSEC ? nowInSeconds : state.getNowInSeconds();
}
+ public boolean isEligibleForArtificialLatency()
+ {
+ return getSpecificOptions().eligibleForArtificialLatency;
+ }
+
/** The keyspace that this query is bound to, or null if not relevant. */
public String getKeyspace() { return getSpecificOptions().keyspace; }
@@ -631,7 +636,7 @@ public List getValues()
// Options that are likely to not be present in most queries
static class SpecificOptions
{
- private static final SpecificOptions DEFAULT = new SpecificOptions(-1, null, null, Long.MIN_VALUE, null, UNSET_NOWINSEC);
+ private static final SpecificOptions DEFAULT = new SpecificOptions(-1, null, null, Long.MIN_VALUE, null, UNSET_NOWINSEC, false);
private final int pageSize;
private final PagingState state;
@@ -639,13 +644,15 @@ static class SpecificOptions
private final long timestamp;
private final String keyspace;
private final long nowInSeconds;
+ private final boolean eligibleForArtificialLatency;
private SpecificOptions(int pageSize,
PagingState state,
ConsistencyLevel serialConsistency,
long timestamp,
String keyspace,
- long nowInSeconds)
+ long nowInSeconds,
+ boolean eligibleForArtificialLatency)
{
this.pageSize = pageSize;
this.state = state;
@@ -653,11 +660,12 @@ private SpecificOptions(int pageSize,
this.timestamp = timestamp;
this.keyspace = keyspace;
this.nowInSeconds = nowInSeconds;
+ this.eligibleForArtificialLatency = eligibleForArtificialLatency;
}
public SpecificOptions withNowInSec(long nowInSec)
{
- return new SpecificOptions(pageSize, state, serialConsistency, timestamp, keyspace, nowInSec);
+ return new SpecificOptions(pageSize, state, serialConsistency, timestamp, keyspace, nowInSec, eligibleForArtificialLatency);
}
}
@@ -674,7 +682,9 @@ enum Flag
TIMESTAMP,
NAMES_FOR_VALUES,
KEYSPACE,
- NOW_IN_SECONDS;
+ NOW_IN_SECONDS,
+ ELIGIBLE_FOR_ARTIFICIAL_LATENCY,
+ ;
private final int mask;
@@ -755,7 +765,8 @@ public QueryOptions decode(ByteBuf body, ProtocolVersion version)
String keyspace = Flag.contains(flags, Flag.KEYSPACE) ? CBUtil.readString(body) : null;
long nowInSeconds = Flag.contains(flags, Flag.NOW_IN_SECONDS) ? CassandraUInt.toLong(body.readInt())
: UNSET_NOWINSEC;
- options = new SpecificOptions(pageSize, pagingState, serialConsistency, timestamp, keyspace, nowInSeconds);
+ boolean eligibleForArtificialLatency = Flag.contains(flags, Flag.ELIGIBLE_FOR_ARTIFICIAL_LATENCY);
+ options = new SpecificOptions(pageSize, pagingState, serialConsistency, timestamp, keyspace, nowInSeconds, eligibleForArtificialLatency);
}
DefaultQueryOptions opts = new DefaultQueryOptions(consistency, null, values, skipMetadata, options, version);
diff --git a/src/java/org/apache/cassandra/cql3/Relation.java b/src/java/org/apache/cassandra/cql3/Relation.java
index 5de031102ce8..3bbcd2a30743 100644
--- a/src/java/org/apache/cassandra/cql3/Relation.java
+++ b/src/java/org/apache/cassandra/cql3/Relation.java
@@ -191,7 +191,7 @@ public boolean onToken()
* @return the Restriction corresponding to this Relation
* @throws InvalidRequestException if this Relation is not valid
*/
- public SingleRestriction toRestriction(TableMetadata table, VariableSpecifications boundNames, boolean allowFiltering)
+ public SingleRestriction toRestriction(TableMetadata table, VariableSpecifications boundNames, Object owner, boolean allowFiltering)
{
ColumnsExpression columnsExpression = rawExpressions.prepare(table);
@@ -210,7 +210,7 @@ public SingleRestriction toRestriction(TableMetadata table, VariableSpecificatio
if (column.isClusteringColumn() && baseType.isCollection() && !column.type.isMultiCell())
throw invalidRequest(FROZEN_MAP_ENTRY_PREDICATES_NOT_SUPPORTED, column.name);
- columnsExpression.collectMarkerSpecification(boundNames);
+ columnsExpression.collectMarkerSpecification(boundNames, owner);
}
operator.validateFor(columnsExpression);
@@ -220,7 +220,7 @@ public SingleRestriction toRestriction(TableMetadata table, VariableSpecificatio
receiver = ((CollectionType>) receiver.type).makeCollectionReceiver(receiver, operator.appliesToMapKeys());
Terms terms = rawTerms.prepare(table.keyspace, receiver);
- terms.collectMarkerSpecification(boundNames);
+ terms.collectMarkerSpecification(boundNames, owner);
// An IN restriction with only one element is the same as an EQ restriction
if (operator.isIN() && terms.containsSingleTerm())
diff --git a/src/java/org/apache/cassandra/cql3/VariableSpecifications.java b/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
index 504859cac447..8c797e19dec9 100644
--- a/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
+++ b/src/java/org/apache/cassandra/cql3/VariableSpecifications.java
@@ -21,6 +21,8 @@
import java.util.Collections;
import java.util.List;
+import javax.annotation.Nullable;
+
import com.google.common.collect.ImmutableList;
import org.apache.cassandra.schema.ColumnMetadata;
@@ -32,6 +34,8 @@ public class VariableSpecifications
private final List specs;
private volatile ImmutableList immutableSpecs;
private final ColumnMetadata[] targetColumns;
+ // TODO (desired): this is an ugly way to figure out which sub statement we're using as a key in transactions, but path of least resistance...
+ private @Nullable Object[] targetOwners;
public VariableSpecifications(List variableNames)
{
@@ -40,6 +44,12 @@ public VariableSpecifications(List variableNames)
this.targetColumns = new ColumnMetadata[variableNames.size()];
}
+ public void setSaveTargetOwners(boolean saveTargetOwners)
+ {
+ this.targetOwners = saveTargetOwners ? new Object[variableNames.size()] : null;
+ }
+
+
/**
* Returns an empty instance of VariableSpecifications.
* @return an empty instance of VariableSpecifications
@@ -77,14 +87,14 @@ public ImmutableList getImmutableBindVariables()
*
* Callers of this method should ensure that all statements operate on the same table.
*/
- public short[] getPartitionKeyBindVariableIndexes(TableMetadata metadata)
+ public short[] getPartitionKeyBindVariableIndexes(TableMetadata metadata, Object targetOwner)
{
short[] partitionKeyPositions = new short[metadata.partitionKeyColumns().size()];
boolean[] set = new boolean[partitionKeyPositions.length];
for (int i = 0; i < targetColumns.length; i++)
{
ColumnMetadata targetColumn = targetColumns[i];
- if (targetColumn != null && targetColumn.isPartitionKey())
+ if (targetColumn != null && targetColumn.isPartitionKey() && (targetOwners == null || (targetOwner != null && targetOwners[i] == targetOwner)))
{
assert targetColumn.ksName.equals(metadata.keyspace) && targetColumn.cfName.equals(metadata.name);
partitionKeyPositions[targetColumn.position()] = (short) i;
@@ -99,12 +109,15 @@ public short[] getPartitionKeyBindVariableIndexes(TableMetadata metadata)
return partitionKeyPositions;
}
- public void add(int bindIndex, ColumnSpecification spec)
+ public void add(int bindIndex, ColumnSpecification spec, Object owner)
{
assert immutableSpecs == null : "bind variable specs cannot be modified once we started to use them";
if (spec instanceof ColumnMetadata)
targetColumns[bindIndex] = (ColumnMetadata) spec;
+ if (targetOwners != null)
+ targetOwners[bindIndex] = owner;
+
ColumnIdentifier bindMarkerName = variableNames.get(bindIndex);
// Use the user name, if there is one
if (bindMarkerName != null)
diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
index ae8de72ddf14..892cdbaa91d7 100644
--- a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
+++ b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
@@ -104,10 +104,10 @@ public void addFunctionsTo(List functions)
* @param boundNames the list of column specification where to collect the
* bind variables of this term in.
*/
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- columnsExpression.collectMarkerSpecification(boundNames);
- values.collectMarkerSpecification(boundNames);
+ columnsExpression.collectMarkerSpecification(boundNames, owner);
+ values.collectMarkerSpecification(boundNames, owner);
}
public ColumnCondition.Bound bind(QueryOptions options)
diff --git a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
index 2603680f9032..fab0402e6244 100644
--- a/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
+++ b/src/java/org/apache/cassandra/cql3/functions/FunctionCall.java
@@ -61,10 +61,10 @@ public void addFunctionsTo(List functions)
fun.addFunctionsTo(functions);
}
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
for (Term t : terms)
- t.collectMarkerSpecification(boundNames);
+ t.collectMarkerSpecification(boundNames, owner);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
index 0f703e53c4f4..59935afdd12b 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
@@ -45,11 +45,11 @@ public CustomIndexExpression(QualifiedName targetIndex, Term.Raw value)
this.valueRaw = value;
}
- public void prepareValue(TableMetadata table, AbstractType> expressionType, VariableSpecifications boundNames)
+ public void prepareValue(TableMetadata table, AbstractType> expressionType, VariableSpecifications boundNames, Object owner)
{
ColumnSpecification spec = new ColumnSpecification(table.keyspace, table.keyspace, valueColId, expressionType);
value = valueRaw.prepare(table.keyspace, spec);
- value.collectMarkerSpecification(boundNames);
+ value.collectMarkerSpecification(boundNames, owner);
}
public void addToRowFilter(RowFilter filter, TableMetadata table, QueryOptions options)
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index 4190bf28d020..80ab46576b71 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -175,11 +175,12 @@ public StatementRestrictions(ClientState state,
WhereClause whereClause,
VariableSpecifications boundNames,
List orderings,
+ Object owner,
boolean selectsOnlyStaticColumns,
boolean allowFiltering,
boolean forView)
{
- this(state, type, table, indexHints, whereClause, boundNames, orderings, selectsOnlyStaticColumns, type.allowUseOfSecondaryIndices(), allowFiltering, forView);
+ this(state, type, table, indexHints, whereClause, boundNames, orderings, owner, selectsOnlyStaticColumns, type.allowUseOfSecondaryIndices(), allowFiltering, forView);
}
/*
@@ -193,6 +194,7 @@ public StatementRestrictions(ClientState state,
WhereClause whereClause,
VariableSpecifications boundNames,
List orderings,
+ Object owner,
boolean selectsOnlyStaticColumns,
boolean allowUseOfSecondaryIndices,
boolean allowFiltering,
@@ -227,11 +229,11 @@ public StatementRestrictions(ClientState state,
if (!forView)
throw new InvalidRequestException("Unsupported restriction: " + relation);
- this.notNullColumns.addAll(relation.toRestriction(table, boundNames, allowFiltering).columns());
+ this.notNullColumns.addAll(relation.toRestriction(table, boundNames, owner, allowFiltering).columns());
}
else if (operator.requiresIndexing())
{
- Restriction restriction = relation.toRestriction(table, boundNames, allowFiltering);
+ Restriction restriction = relation.toRestriction(table, boundNames, owner, allowFiltering);
if (!type.allowUseOfSecondaryIndices() || !restriction.hasSupportingIndex(indexRegistry, indexHints))
throw invalidRequest("%s restriction is only supported on properly " +
@@ -241,7 +243,7 @@ else if (operator.requiresIndexing())
}
else
{
- addRestriction(relation.toRestriction(table, boundNames, allowFiltering), indexRegistry, indexHints);
+ addRestriction(relation.toRestriction(table, boundNames, owner, allowFiltering), indexRegistry, indexHints);
}
}
@@ -756,7 +758,7 @@ private void processCustomIndexExpressions(List expressio
if (expressionType == null)
throw IndexRestrictions.customExpressionNotSupported(expression.targetIndex);
- expression.prepareValue(table, expressionType, boundNames);
+ expression.prepareValue(table, expressionType, boundNames, table);
filterRestrictions.add(expression);
}
diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
index 301a6306b433..6eac5184822f 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
@@ -222,7 +222,7 @@ public Selector.Factory newSelectorFactory(TableMetadata table, AbstractType>
// something a lot more helpful and in practice user can bind those markers by position or, even better,
// use bind markers.
Term term = rawTerm.prepare(table.keyspace, new ColumnSpecification(table.keyspace, table.name, bindMarkerNameInSelection, type));
- term.collectMarkerSpecification(boundNames);
+ term.collectMarkerSpecification(boundNames, table);
return TermSelector.newFactory(rawTerm.getText(), term, type);
}
@@ -1481,7 +1481,7 @@ public Selector.Factory newSelectorFactory(TableMetadata cfm, AbstractType> ex
ColumnSpecification boundSpec = specForElementOrSlice(selected, receiver, ((CollectionType) type).kind, "Element");
Term elt = element.prepare(cfm.keyspace, boundSpec);
- elt.collectMarkerSpecification(boundNames);
+ elt.collectMarkerSpecification(boundNames, cfm);
return ElementsSelector.newElementFactory(toString(), factory, (CollectionType)type, elt);
}
@@ -1571,8 +1571,8 @@ public Selector.Factory newSelectorFactory(TableMetadata cfm, AbstractType> ex
// The equivalent of doing this when preparing values would be to use UNSET.
Term f = from == null ? Constants.UNSET_VALUE : from.prepare(cfm.keyspace, boundSpec);
Term t = to == null ? Constants.UNSET_VALUE : to.prepare(cfm.keyspace, boundSpec);
- f.collectMarkerSpecification(boundNames);
- t.collectMarkerSpecification(boundNames);
+ f.collectMarkerSpecification(boundNames, cfm);
+ t.collectMarkerSpecification(boundNames, cfm);
return ElementsSelector.newSliceFactory(toString(), factory, (CollectionType)type, f, t);
}
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index 302d923436c5..1785fbadcfc6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
@@ -222,7 +222,7 @@ public short[] getPartitionKeyBindVariableIndexes()
// multiple tables, we won't send partition key bind indexes.
return (affectsMultipleTables || statements.isEmpty())
? null
- : bindVariables.getPartitionKeyBindVariableIndexes(statements.get(0).metadata());
+ : bindVariables.getPartitionKeyBindVariableIndexes(statements.get(0).metadata(), statements.get(0).attrs);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
index ec5a52497ff3..157f1f7d8a7e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DeleteStatement.java
@@ -176,11 +176,11 @@ protected ModificationStatement prepareInternal(ClientState state,
checkFalse(def.isPrimaryKeyColumn(), "Invalid identifier %s for deletion (should not be a PRIMARY KEY part)", def.name);
Operation op = deletion.prepare(metadata.keyspace, def, metadata);
- op.collectMarkerSpecification(bindVariables);
+ op.collectMarkerSpecification(bindVariables, attrs);
operations.add(op, metadata);
}
- StatementRestrictions restrictions = newRestrictions(state, metadata, bindVariables, operations, whereClause, conditions);
+ StatementRestrictions restrictions = newRestrictions(state, metadata, bindVariables, operations, whereClause, conditions, attrs);
DeleteStatement stmt = new DeleteStatement(bindVariables,
metadata,
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index ae2bcaacaec5..7a6235b5344e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -282,7 +282,7 @@ public ImmutableList getBindVariables()
@Override
public short[] getPartitionKeyBindVariableIndexes()
{
- return bindVariables.getPartitionKeyBindVariableIndexes(metadata);
+ return bindVariables.getPartitionKeyBindVariableIndexes(metadata, attrs);
}
@Override
@@ -1328,7 +1328,7 @@ private ColumnConditions prepareColumnConditions(TableMetadata metadata, Variabl
for (ColumnCondition.Raw rawCondition : conditions)
{
ColumnCondition condition = rawCondition.prepare(metadata);
- condition.collectMarkerSpecification(bindVariables);
+ condition.collectMarkerSpecification(bindVariables, attrs);
builder.add(condition);
}
@@ -1356,13 +1356,14 @@ protected StatementRestrictions newRestrictions(ClientState state,
VariableSpecifications boundNames,
Operations operations,
WhereClause where,
- Conditions conditions)
+ Conditions conditions,
+ Object owner)
{
if (where.containsCustomExpressions())
throw new InvalidRequestException(CUSTOM_EXPRESSIONS_NOT_ALLOWED);
boolean applyOnlyToStaticColumns = appliesOnlyToStaticColumns(operations, conditions);
- return new StatementRestrictions(state, type, metadata, IndexHints.NONE, where, boundNames, Collections.emptyList(), applyOnlyToStaticColumns, false, false);
+ return new StatementRestrictions(state, type, metadata, IndexHints.NONE, where, boundNames, Collections.emptyList(), owner, applyOnlyToStaticColumns, false, false);
}
public List getConditions()
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index 735c2627a453..a32dc8182d05 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -241,7 +241,7 @@ public ImmutableList getBindVariables()
@Override
public short[] getPartitionKeyBindVariableIndexes()
{
- return bindVariables.getPartitionKeyBindVariableIndexes(table);
+ return bindVariables.getPartitionKeyBindVariableIndexes(table, table);
}
@Override
@@ -1468,7 +1468,7 @@ private Map getOrderingColumns(List ordering
private List getOrderings(TableMetadata table)
{
return parameters.orderings.stream()
- .map(o -> o.bind(table, bindVariables))
+ .map(o -> o.bind(table, bindVariables, table))
.collect(Collectors.toList());
}
@@ -1498,6 +1498,7 @@ private StatementRestrictions prepareRestrictions(ClientState state,
whereClause,
boundNames,
orderings,
+ metadata,
selectsOnlyStaticColumns,
parameters.allowFiltering || !requiresAllowFilteringIfNotSpecified(metadata, true),
forView);
@@ -1511,7 +1512,7 @@ private Term prepareLimit(VariableSpecifications boundNames, Term.Raw limit,
return null;
Term prepLimit = limit.prepare(keyspace, limitReceiver);
- prepLimit.collectMarkerSpecification(boundNames);
+ prepLimit.collectMarkerSpecification(boundNames, null);
return prepLimit;
}
diff --git a/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java
index 50b874e4fd87..5d50e0e48675 100644
--- a/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/TransactionStatement.java
@@ -204,6 +204,25 @@ public ImmutableList getBindVariables()
return bindVariables.getImmutableBindVariables();
}
+ @Override
+ public short[] getPartitionKeyBindVariableIndexes()
+ {
+ if (returningSelect != null)
+ {
+ short[] result = returningSelect.select.getPartitionKeyBindVariableIndexes();
+ if (result != null)
+ return result;
+ }
+
+ for (ModificationStatement stmt : updates)
+ {
+ short[] result = stmt.getPartitionKeyBindVariableIndexes();
+ if (result != null)
+ return result;
+ }
+ return null;
+ }
+
@Override
public void authorize(ClientState state)
{
@@ -733,6 +752,7 @@ public CQLStatement prepare(ClientState state)
if (select != null || returning != null)
checkTrue(select != null ^ returning != null, "Cannot specify both a full SELECT and a SELECT w/ LET references.");
+ bindVariables.setSaveTargetOwners(true);
List preparedAssignments = new ArrayList<>(assignments.size());
Map refSources = new HashMap<>();
Set selectNames = new HashSet<>();
diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
index 28ed1907cca4..f0fe6ce20f00 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
@@ -220,7 +220,7 @@ else if (value instanceof ReferenceValue.Raw)
else
{
Operation operation = new Operation.SetValue(value).prepare(metadata, def, !conditions.isEmpty());
- operation.collectMarkerSpecification(bindVariables);
+ operation.collectMarkerSpecification(bindVariables, attrs);
operations.add(operation, metadata);
}
}
@@ -234,6 +234,7 @@ else if (value instanceof ReferenceValue.Raw)
whereClause.build(),
bindVariables,
Collections.emptyList(),
+ attrs,
applyOnlyToStaticColumns,
false,
false);
@@ -295,7 +296,7 @@ protected ModificationStatement prepareInternal(ClientState state,
else
{
Operation operation = new Operation.SetValue(raw).prepare(metadata, def, !conditions.isEmpty());
- operation.collectMarkerSpecification(bindVariables);
+ operation.collectMarkerSpecification(bindVariables, attrs);
operations.add(operation, metadata);
}
}
@@ -309,6 +310,7 @@ protected ModificationStatement prepareInternal(ClientState state,
whereClause.build(),
bindVariables,
Collections.emptyList(),
+ attrs,
applyOnlyToStaticColumns,
false,
false);
@@ -417,7 +419,7 @@ protected ModificationStatement prepareInternal(ClientState state,
ColumnMetadata def = metadata.getExistingColumn(entry.left);
checkFalse(def.isPrimaryKeyColumn(), UPDATING_PRIMARY_KEY_MESSAGE, def.name);
Operation operation = entry.right.prepare(metadata, def, !conditions.isEmpty() || isForTxn);
- operation.collectMarkerSpecification(bindVariables);
+ operation.collectMarkerSpecification(bindVariables, attrs);
operations.add(operation, metadata);
}
@@ -435,7 +437,8 @@ protected ModificationStatement prepareInternal(ClientState state,
bindVariables,
operations,
whereClause,
- conditions);
+ conditions,
+ attrs);
return new UpdateStatement(type,
bindVariables,
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
index 3b204b236cb6..d0b219ca62cc 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
@@ -310,6 +310,7 @@ public Keyspaces apply(ClusterMetadata metadata)
whereClause,
VariableSpecifications.empty(),
Collections.emptyList(),
+ null,
false,
false,
true,
diff --git a/src/java/org/apache/cassandra/cql3/terms/InMarker.java b/src/java/org/apache/cassandra/cql3/terms/InMarker.java
index 8059f741b177..857d9e0018f9 100644
--- a/src/java/org/apache/cassandra/cql3/terms/InMarker.java
+++ b/src/java/org/apache/cassandra/cql3/terms/InMarker.java
@@ -54,9 +54,9 @@ private InMarker(int bindIndex, ColumnSpecification receiver)
public void addFunctionsTo(List functions) {}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- boundNames.add(bindIndex, receiver);
+ boundNames.add(bindIndex, receiver, owner);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/terms/Lists.java b/src/java/org/apache/cassandra/cql3/terms/Lists.java
index b0f319e6f213..501306df7200 100644
--- a/src/java/org/apache/cassandra/cql3/terms/Lists.java
+++ b/src/java/org/apache/cassandra/cql3/terms/Lists.java
@@ -343,10 +343,10 @@ public boolean requiresRead()
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- super.collectMarkerSpecification(boundNames);
- idx.collectMarkerSpecification(boundNames);
+ super.collectMarkerSpecification(boundNames, owner);
+ idx.collectMarkerSpecification(boundNames, owner);
}
public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
diff --git a/src/java/org/apache/cassandra/cql3/terms/Maps.java b/src/java/org/apache/cassandra/cql3/terms/Maps.java
index b21d84bf81ed..0b2af9724583 100644
--- a/src/java/org/apache/cassandra/cql3/terms/Maps.java
+++ b/src/java/org/apache/cassandra/cql3/terms/Maps.java
@@ -276,10 +276,10 @@ public SetterByKey(ColumnMetadata column, Term k, Term t)
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- super.collectMarkerSpecification(boundNames);
- k.collectMarkerSpecification(boundNames);
+ super.collectMarkerSpecification(boundNames, owner);
+ k.collectMarkerSpecification(boundNames, owner);
}
public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
diff --git a/src/java/org/apache/cassandra/cql3/terms/Marker.java b/src/java/org/apache/cassandra/cql3/terms/Marker.java
index d36e81bdfd5f..12c95d6619a8 100644
--- a/src/java/org/apache/cassandra/cql3/terms/Marker.java
+++ b/src/java/org/apache/cassandra/cql3/terms/Marker.java
@@ -55,9 +55,9 @@ private Marker(int bindIndex, ColumnSpecification receiver)
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- boundNames.add(bindIndex, receiver);
+ boundNames.add(bindIndex, receiver, owner);
}
@Override
diff --git a/src/java/org/apache/cassandra/cql3/terms/MultiElements.java b/src/java/org/apache/cassandra/cql3/terms/MultiElements.java
index 40d4a20e781e..8fc8833b3078 100644
--- a/src/java/org/apache/cassandra/cql3/terms/MultiElements.java
+++ b/src/java/org/apache/cassandra/cql3/terms/MultiElements.java
@@ -116,12 +116,12 @@ public DelayedValue(MultiElementType> type, List elements)
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
if (type.supportsElementBindMarkers())
{
for (int i = 0, m = elements.size(); i < m; i++)
- elements.get(i).collectMarkerSpecification(boundNames);
+ elements.get(i).collectMarkerSpecification(boundNames, owner);
}
}
diff --git a/src/java/org/apache/cassandra/cql3/terms/Term.java b/src/java/org/apache/cassandra/cql3/terms/Term.java
index 0fc267392264..64912fe0cdf4 100644
--- a/src/java/org/apache/cassandra/cql3/terms/Term.java
+++ b/src/java/org/apache/cassandra/cql3/terms/Term.java
@@ -68,9 +68,10 @@ public int size()
* This is obviously a no-op if the term is Terminal.
*
* @param boundNames the variables specification where to collect the
- * bind variables of this term in.
+ * bind variables of this term in.
+     * @param owner the statement or schema element that owns this term's bind markers; recorded so transactions can attribute markers to their sub-statement (may be {@code null})
*/
- void collectMarkerSpecification(VariableSpecifications boundNames);
+ void collectMarkerSpecification(VariableSpecifications boundNames, Object owner);
/**
* Bind the values in this term to the values contained in the {@code options}.
@@ -246,7 +247,7 @@ public boolean containsBindMarker()
abstract class Terminal implements Term
{
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames) {}
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner) {}
@Override
public Terminal bind(QueryOptions options) { return this; }
diff --git a/src/java/org/apache/cassandra/cql3/terms/Terms.java b/src/java/org/apache/cassandra/cql3/terms/Terms.java
index 10f0c5d137e2..9936089d9b6f 100644
--- a/src/java/org/apache/cassandra/cql3/terms/Terms.java
+++ b/src/java/org/apache/cassandra/cql3/terms/Terms.java
@@ -95,9 +95,10 @@ public String toString()
* This is obviously a no-op if the terms are Terminals.
*
* @param boundNames the variables specification where to collect the
- * bind variables of the terms in.
+ * bind variables of the terms in.
+     * @param owner the statement or schema element that owns these terms' bind markers; recorded so transactions can attribute markers to their sub-statement (may be {@code null})
*/
- void collectMarkerSpecification(VariableSpecifications boundNames);
+ void collectMarkerSpecification(VariableSpecifications boundNames, Object owner);
/**
* Bind the values in these terms to the values contained in {@code options}.
@@ -453,7 +454,7 @@ public boolean containsSingleTerm()
}
};
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames) {}
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner) {}
@Override
public final Terminals bind(QueryOptions options)
@@ -621,9 +622,9 @@ public void addFunctionsTo(List functions)
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
- term.collectMarkerSpecification(boundNames);
+ term.collectMarkerSpecification(boundNames, owner);
}
@Override
@@ -687,12 +688,12 @@ public void addFunctionsTo(List functions)
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
for (int i = 0, m = terms.size(); i < m; i++)
{
Term term = terms.get(i);
- term.collectMarkerSpecification(boundNames);
+ term.collectMarkerSpecification(boundNames, owner);
}
}
diff --git a/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java b/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java
index 46747a168e20..7f19d834d934 100644
--- a/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java
+++ b/src/java/org/apache/cassandra/cql3/transactions/ConditionStatement.java
@@ -93,7 +93,7 @@ public ConditionStatement prepare(String keyspace, VariableSpecifications bindVa
{
// In the IS NULL/IS NOT NULL case, the reference will always be on the LHS
RowDataReference reference = ((RowDataReference.Raw) lhs).prepareAsReceiver();
- reference.collectMarkerSpecification(bindVariables);
+ reference.collectMarkerSpecification(bindVariables, null);
return new ConditionStatement(reference, kind, null, false);
}
@@ -124,8 +124,8 @@ else if (rhs instanceof RowDataReference.Raw)
throw new IllegalStateException("Either the left-hand or right-hand side must be a reference!");
}
- reference.collectMarkerSpecification(bindVariables);
- value.collectMarkerSpecification(bindVariables);
+ reference.collectMarkerSpecification(bindVariables, null);
+ value.collectMarkerSpecification(bindVariables, null);
return new ConditionStatement(reference, kind, value, reversed);
}
}
diff --git a/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java b/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java
index 161e1d570611..21d6c442ebc5 100644
--- a/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java
+++ b/src/java/org/apache/cassandra/cql3/transactions/RowDataReference.java
@@ -77,10 +77,10 @@ public RowDataReference(String selectName, int txnDataName, ColumnMetadata colum
}
@Override
- public void collectMarkerSpecification(VariableSpecifications boundNames)
+ public void collectMarkerSpecification(VariableSpecifications boundNames, Object owner)
{
if (elementPath != null)
- elementPath.collectMarkerSpecification(boundNames);
+ elementPath.collectMarkerSpecification(boundNames, owner);
}
@Override
diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index 2a8c30d4c93f..086867933b49 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
@@ -45,7 +45,12 @@ public enum ConsistencyLevel
SERIAL (8),
LOCAL_SERIAL(9, true),
LOCAL_ONE (10, true),
- NODE_LOCAL (11, true);
+ NODE_LOCAL (11, true),
+ UNSAFE_DELAY_QUORUM(99, false),
+ UNSAFE_DELAY_SERIAL(100, false),
+ UNSAFE_DELAY_LOCAL_QUORUM(101, true),
+ UNSAFE_DELAY_LOCAL_SERIAL(102, true);
+
// Used by the binary protocol
public final int code;
@@ -143,12 +148,16 @@ public int blockFor(AbstractReplicationStrategy replicationStrategy)
return 2;
case THREE:
return 3;
+ case UNSAFE_DELAY_QUORUM:
case QUORUM:
+ case UNSAFE_DELAY_SERIAL:
case SERIAL:
return quorumFor(replicationStrategy);
case ALL:
return replicationStrategy.getReplicationFactor().allReplicas;
+ case UNSAFE_DELAY_LOCAL_QUORUM:
case LOCAL_QUORUM:
+ case UNSAFE_DELAY_LOCAL_SERIAL:
case LOCAL_SERIAL:
return localQuorumForOurDc(replicationStrategy);
case EACH_QUORUM:
@@ -178,13 +187,16 @@ public int blockForWrite(AbstractReplicationStrategy replicationStrategy, Endpoi
{
case ANY:
break;
- case LOCAL_ONE: case LOCAL_QUORUM: case LOCAL_SERIAL:
+ case LOCAL_ONE:
+ case UNSAFE_DELAY_LOCAL_QUORUM: case LOCAL_QUORUM:
+ case UNSAFE_DELAY_LOCAL_SERIAL: case LOCAL_SERIAL:
// we will only count local replicas towards our response count, as these queries only care about local guarantees
blockFor += pending.count(InOurDc.replicas());
break;
case ONE: case TWO: case THREE:
- case QUORUM: case EACH_QUORUM:
- case SERIAL:
+ case UNSAFE_DELAY_QUORUM: case QUORUM:
+ case UNSAFE_DELAY_SERIAL: case SERIAL:
+ case EACH_QUORUM:
case ALL:
blockFor += pending.size();
}
@@ -219,7 +231,9 @@ public void validateForWrite() throws InvalidRequestException
switch (this)
{
case SERIAL:
+ case UNSAFE_DELAY_SERIAL:
case LOCAL_SERIAL:
+ case UNSAFE_DELAY_LOCAL_SERIAL:
throw new InvalidRequestException("You must use conditional updates for serializable writes");
}
}
@@ -233,7 +247,9 @@ public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy
requireNetworkTopologyStrategy(replicationStrategy);
break;
case SERIAL:
+ case UNSAFE_DELAY_SERIAL:
case LOCAL_SERIAL:
+ case UNSAFE_DELAY_LOCAL_SERIAL:
throw new InvalidRequestException(this + " is not supported as conditional update commit consistency. Use ANY if you mean \"make sure it is accepted but I don't care how many replicas commit it for non-SERIAL reads\"");
}
}
@@ -246,7 +262,16 @@ public void validateForCas() throws InvalidRequestException
public boolean isSerialConsistency()
{
- return this == SERIAL || this == LOCAL_SERIAL;
+ switch (this)
+ {
+ case SERIAL:
+ case UNSAFE_DELAY_SERIAL:
+ case LOCAL_SERIAL:
+ case UNSAFE_DELAY_LOCAL_SERIAL:
+ return true;
+ default:
+ return false;
+ }
}
public void validateCounterForWrite(TableMetadata metadata) throws InvalidRequestException
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index d161fb315d2c..e88a6d1c2745 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -925,12 +925,12 @@ private UnfilteredPartitionIterator maybeSlowDownForTesting(UnfilteredPartitionI
*/
public Message createMessage(boolean trackRepairedData, Dispatcher.RequestTime requestTime)
{
- List flags = new ArrayList<>(3);
- flags.add(MessageFlag.CALL_BACK_ON_FAILURE);
+ int flags = MessageFlag.CALL_BACK_ON_FAILURE.addTo(0);
+
if (trackWarnings)
- flags.add(MessageFlag.TRACK_WARNINGS);
+ flags = MessageFlag.TRACK_WARNINGS.addTo(flags);
if (trackRepairedData)
- flags.add(MessageFlag.TRACK_REPAIRED_DATA);
+ flags = MessageFlag.TRACK_REPAIRED_DATA.addTo(flags);
return Message.outWithFlags(verb(),
this,
diff --git a/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java b/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java
index 89b801d0e3b3..2e114186f193 100644
--- a/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java
+++ b/src/java/org/apache/cassandra/db/virtual/AccordDebugKeyspace.java
@@ -74,7 +74,6 @@
import accord.local.MaxConflicts;
import accord.local.Node;
import accord.local.PreLoadContext;
-import accord.local.RejectBefore;
import accord.local.SafeCommand;
import accord.local.SafeCommandStore;
import accord.local.StoreParticipants;
@@ -107,7 +106,7 @@
import accord.utils.async.AsyncResult;
import accord.utils.async.AsyncResults;
-import org.apache.cassandra.config.AccordSpec.JournalSpec.ReplayMode;
+import org.apache.cassandra.config.AccordConfig.JournalConfig.ReplayMode;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
import org.apache.cassandra.db.ColumnFamilyStore;
@@ -211,7 +210,6 @@ public class AccordDebugKeyspace extends VirtualKeyspace
public static final String NODE_OPS = "node_ops";
public static final String PROGRESS_LOG = "progress_log";
public static final String REDUNDANT_BEFORE = "redundant_before";
- public static final String REJECT_BEFORE = "reject_before";
public static final String TXN = "txn";
public static final String TXN_CACHE = "txn_cache";
public static final String TXN_BLOCKED_BY = "txn_blocked_by";
@@ -252,7 +250,6 @@ private AccordDebugKeyspace()
new NodeOpsTable(),
new ProgressLogTable(),
new RedundantBeforeTable(),
- new RejectBeforeTable(),
new TxnBlockedByTable(),
new TxnTable(),
new TxnCacheTable(),
@@ -819,7 +816,9 @@ private MaxConflictsTable()
" token_start 'TokenUtf8Type',\n" +
" table_id text,\n" +
" token_end 'TokenUtf8Type',\n" +
- " timestamp text,\n" +
+ " any text,\n" +
+ " write text,\n" +
+ " reject text,\n" +
" PRIMARY KEY (command_store_id, token_start)" +
')', Int32Type.instance), FAIL, ASC);
}
@@ -843,7 +842,9 @@ public void collect(PartitionsCollector collector)
.lazyCollect(columns -> {
columns.add("token_end", entry.end(), AccordDebugKeyspace::printToken)
.add("table_id", tableIdStr)
- .add("timestamp", entry, TO_STRING);
+ .add("any", entry.any, TO_STRING)
+ .add("write", entry.write, TO_STRING)
+ .add("reject", entry.reject, TO_STRING);
});
return rows;
}, rows
@@ -1084,47 +1085,6 @@ public void collect(PartitionsCollector collector)
}
}
- public static final class RejectBeforeTable extends AbstractLazyVirtualTable
- {
- private RejectBeforeTable()
- {
- super(parse(VIRTUAL_ACCORD_DEBUG, REJECT_BEFORE,
- "Accord per-CommandStore RejectBefore State",
- "CREATE TABLE %s (\n" +
- " command_store_id int,\n" +
- " token_start 'TokenUtf8Type',\n" +
- " table_id text,\n" +
- " token_end 'TokenUtf8Type',\n" +
- " timestamp text,\n" +
- " PRIMARY KEY (command_store_id, token_start)" +
- ')', UTF8Type.instance), FAIL, ASC);
- }
-
- @Override
- protected void collect(PartitionsCollector collector)
- {
- CommandStores commandStores = AccordService.unsafeInstance().node().commandStores();
- for (CommandStore commandStore : commandStores.all())
- {
- RejectBefore rejectBefore = commandStore.unsafeGetRejectBefore();
- if (rejectBefore == null)
- continue;
-
- collector.partition(commandStore.id()).collect(rows -> {
- TableId tableId = ((AccordCommandStore)commandStore).tableId();
- String tableIdStr = tableId.toString();
- rejectBefore.foldlWithBounds((timestamp, rs, start, end) -> {
- rs.add(printToken(start))
- .lazyCollect(columns -> columns.add("table_id", tableIdStr)
- .add("token_end", end, AccordDebugKeyspace::printToken)
- .add("timestamp", timestamp, AccordDebugKeyspace::toStringOrNull));
- return rs;
- }, rows, ignore -> false);
- });
- }
- }
- }
-
/**
* Usage:
* collect N events (may be more than N messages)
@@ -1259,6 +1219,7 @@ private TxnTracesTable()
" event text,\n" +
" at_micros bigint,\n" +
" command_store_id int,\n" +
+ " remote_node_id int,\n" +
" message text,\n" +
" PRIMARY KEY (txn_id, id_micros, event, at_micros)" +
')', TxnIdUtf8Type.instance), FAIL, UNSORTED, UNSORTED);
@@ -1302,6 +1263,7 @@ public void truncate()
@Override
public void collect(PartitionsCollector collector)
{
+ int nodeId = AccordService.unsafeInstance().nodeId().id;
tracing().forEach(id -> true, (txnId, events) -> {
events.forEach(e -> {
if (e.messages().isEmpty())
@@ -1318,6 +1280,8 @@ public void collect(PartitionsCollector collector)
.eagerCollect(columns -> {
columns.add("command_store_id", m.commandStoreId)
.add("message", m.message);
+ if (m.nodeId >= 0)
+ columns.add("remote_node_id", m.nodeId);
});
});
}
diff --git a/src/java/org/apache/cassandra/journal/Segment.java b/src/java/org/apache/cassandra/journal/Segment.java
index 5d6d65f82f52..0984d9728efa 100644
--- a/src/java/org/apache/cassandra/journal/Segment.java
+++ b/src/java/org/apache/cassandra/journal/Segment.java
@@ -18,6 +18,7 @@
package org.apache.cassandra.journal;
import java.nio.ByteBuffer;
+import java.util.concurrent.RejectedExecutionException;
import accord.utils.Invariants;
@@ -44,8 +45,12 @@ public final void run()
public final void tidy()
{
- if (executor != null) executor.execute(this);
- else onUnreferenced();
+ if (executor != null)
+ {
+ try { executor.execute(this); return; }
+ catch (RejectedExecutionException rje) { /* fallthrough and call directly */ }
+ }
+ onUnreferenced();
}
}
diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlans.java b/src/java/org/apache/cassandra/locator/ReplicaPlans.java
index da52510bf343..61def76d1ab9 100644
--- a/src/java/org/apache/cassandra/locator/ReplicaPlans.java
+++ b/src/java/org/apache/cassandra/locator/ReplicaPlans.java
@@ -110,6 +110,7 @@ public static boolean isSufficientLiveReplicasForRead(Locator locator, AbstractR
return true;
case LOCAL_ONE:
return countInOurDc(liveReplicas).hasAtleast(1, 1);
+ case UNSAFE_DELAY_LOCAL_QUORUM:
case LOCAL_QUORUM:
return countInOurDc(liveReplicas).hasAtleast(localQuorumForOurDc(replicationStrategy), 1);
case EACH_QUORUM:
@@ -157,6 +158,7 @@ static void assureSufficientLiveReplicas(Locator locator, AbstractReplicationStr
throw UnavailableException.create(consistencyLevel, 1, blockForFullReplicas, localLive.allReplicas(), localLive.fullReplicas());
break;
}
+ case UNSAFE_DELAY_LOCAL_QUORUM:
case LOCAL_QUORUM:
{
Replicas.ReplicaCount localLive = countInOurDc(allLive);
@@ -753,7 +755,7 @@ public static ReplicaPlan.ForPaxosWrite forPaxos(ClusterMetadata metadata, Keysp
Replicas.temporaryAssertFull(liveAndDown.all()); // TODO CASSANDRA-14547
- if (consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL)
+ if (consistencyForPaxos.isDatacenterLocal())
{
// TODO: we should cleanup our semantics here, as we're filtering ALL nodes to localDC which is unexpected for ReplicaPlan
// Restrict natural and pending to node in the local DC only
diff --git a/src/java/org/apache/cassandra/metrics/AccordCoordinatorMetrics.java b/src/java/org/apache/cassandra/metrics/AccordCoordinatorMetrics.java
index 376c19f673fc..52598cf2372c 100644
--- a/src/java/org/apache/cassandra/metrics/AccordCoordinatorMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/AccordCoordinatorMetrics.java
@@ -34,6 +34,7 @@
import accord.primitives.Ballot;
import accord.primitives.Deps;
import accord.primitives.TxnId;
+import accord.utils.UnhandledEnum;
import org.apache.cassandra.service.accord.api.AccordTimeService;
import org.apache.cassandra.tracing.Tracing;
@@ -248,10 +249,14 @@ public void onExecuting(TxnId txnId, @Nullable Ballot ballot, Deps deps, @Nullab
{
switch (path)
{
+ default: throw new UnhandledEnum(path);
case EPHEMERAL: metrics.ephemeral.mark(); break;
case FAST: metrics.fastPaths.mark(); break;
case MEDIUM: metrics.mediumPaths.mark(); break;
case SLOW: metrics.slowPaths.mark(); break;
+ case BACKLOG:
+ case RECOVER:
+ break;
}
}
}
diff --git a/src/java/org/apache/cassandra/metrics/AccordExecutorMetrics.java b/src/java/org/apache/cassandra/metrics/AccordExecutorMetrics.java
index a59710f4a08f..3d5132131a8a 100644
--- a/src/java/org/apache/cassandra/metrics/AccordExecutorMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/AccordExecutorMetrics.java
@@ -39,6 +39,7 @@ public class AccordExecutorMetrics
public final ShardedDecayingHistogram elapsedPreparingToRun = HISTOGRAMS.newHistogram(TimeUnit.SECONDS.toNanos(1L));
public final ShardedDecayingHistogram elapsedWaitingToRun = HISTOGRAMS.newHistogram(TimeUnit.SECONDS.toNanos(1L));
public final ShardedDecayingHistogram elapsedRunning = HISTOGRAMS.newHistogram(TimeUnit.SECONDS.toNanos(1L));
+ public final ShardedDecayingHistogram elapsed = HISTOGRAMS.newHistogram(TimeUnit.SECONDS.toNanos(1L));
// number of keys involved
public final ShardedDecayingHistogram keys = HISTOGRAMS.newHistogram(1 << 12);
diff --git a/src/java/org/apache/cassandra/metrics/LogLinearDecayingHistograms.java b/src/java/org/apache/cassandra/metrics/LogLinearDecayingHistograms.java
index 00fdae201f11..823346917ae0 100644
--- a/src/java/org/apache/cassandra/metrics/LogLinearDecayingHistograms.java
+++ b/src/java/org/apache/cassandra/metrics/LogLinearDecayingHistograms.java
@@ -167,6 +167,13 @@ public void add(Buffer buffer, long value)
Invariants.require(buffer.histograms == LogLinearDecayingHistograms.this);
buffer.add(histogramIndex, value);
}
+
+ public void clear()
+ {
+ flush();
+ totalCount = 0;
+ Arrays.fill(buckets, 0);
+ }
}
private static final long ANTI_DECAY_REFRESH_RATE = TimeUnit.SECONDS.toNanos(1L);
diff --git a/src/java/org/apache/cassandra/metrics/LogLinearHistogram.java b/src/java/org/apache/cassandra/metrics/LogLinearHistogram.java
index 72daefa35703..10c727c7c438 100644
--- a/src/java/org/apache/cassandra/metrics/LogLinearHistogram.java
+++ b/src/java/org/apache/cassandra/metrics/LogLinearHistogram.java
@@ -317,4 +317,10 @@ public void updateSnapshot(LogLinearSnapshot snapshot)
snapshot.totalCount += totalCount;
snapshot.cumulative = null;
}
+
+ public void clear()
+ {
+ totalCount = 0;
+ Arrays.fill(buckets, 0);
+ }
}
diff --git a/src/java/org/apache/cassandra/metrics/ShardedDecayingHistograms.java b/src/java/org/apache/cassandra/metrics/ShardedDecayingHistograms.java
index c0a2a54a1726..69575639cd4c 100644
--- a/src/java/org/apache/cassandra/metrics/ShardedDecayingHistograms.java
+++ b/src/java/org/apache/cassandra/metrics/ShardedDecayingHistograms.java
@@ -76,6 +76,41 @@ public LogLinearDecayingHistogram forShard(DecayingHistogramsShard shard)
return shard.histograms.get(histogramIndex);
}
+ public LogLinearSnapshot refresh()
+ {
+ synchronized (ShardedDecayingHistograms.this)
+ {
+ long now = Clock.Global.currentTimeMillis();
+ List snapshot = new ArrayList<>(ShardedDecayingHistograms.this.snapshot);
+ if (snapshot.size() <= histogramIndex)
+ return ShardedDecayingHistograms.this.refresh(now).get(histogramIndex);
+
+ LogLinearSnapshot result = LogLinearSnapshot.emptyForMax(initialMaxValue);
+ snapshot.set(histogramIndex, result);
+ for (DecayingHistogramsShard shard : shards)
+ shard.updateSnapshot(histogramIndex, result, now);
+ ShardedDecayingHistograms.this.snapshot = snapshot;
+ // don't update snapshotAt, since we have only refreshed one histogram
+ return result;
+ }
+ }
+
+ public void clear()
+ {
+ for (DecayingHistogramsShard shard : shards)
+ {
+ shard.lock.lock();
+ try
+ {
+ shard.histograms.get(histogramIndex).clear();
+ }
+ finally
+ {
+ shard.lock.unlock();
+ }
+ }
+ }
+
@Override
public boolean isCumulative()
{
@@ -108,6 +143,19 @@ void updateSnapshot(List update, long at)
}
}
+ private void updateSnapshot(int histogramIndex, LogLinearSnapshot update, long at)
+ {
+ lock.lock();
+ try
+ {
+ histograms.get(histogramIndex).updateSnapshot(update, at);
+ }
+ finally
+ {
+ lock.unlock();
+ }
+ }
+
public LogLinearDecayingHistograms unsafeGetInternal()
{
return histograms;
diff --git a/src/java/org/apache/cassandra/metrics/ShardedHistogram.java b/src/java/org/apache/cassandra/metrics/ShardedHistogram.java
index e11c10c0ca10..650ca7536fe6 100644
--- a/src/java/org/apache/cassandra/metrics/ShardedHistogram.java
+++ b/src/java/org/apache/cassandra/metrics/ShardedHistogram.java
@@ -139,4 +139,20 @@ public boolean isCumulative()
{
return isCumulative;
}
+
+ public void clear()
+ {
+ for (HistogramShard shard : shards)
+ {
+ shard.lock.lock();
+ try
+ {
+ shard.histogram.clear();
+ }
+ finally
+ {
+ shard.lock.unlock();
+ }
+ }
+ }
}
diff --git a/src/java/org/apache/cassandra/net/ArtificialLatency.java b/src/java/org/apache/cassandra/net/ArtificialLatency.java
new file mode 100644
index 000000000000..1c6485497293
--- /dev/null
+++ b/src/java/org/apache/cassandra/net/ArtificialLatency.java
@@ -0,0 +1,402 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.net;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.PriorityQueue;
+import java.util.Set;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLongFieldUpdater;
+import java.util.concurrent.locks.LockSupport;
+import java.util.function.ToLongFunction;
+import java.util.stream.Collector;
+import java.util.stream.Collectors;
+
+import org.agrona.collections.Object2LongHashMap;
+
+import accord.api.Tracing;
+
+import org.apache.cassandra.concurrent.ExecutorLocals;
+import org.apache.cassandra.concurrent.Interruptible;
+import org.apache.cassandra.config.CassandraRelevantProperties;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.membership.Directory;
+import org.apache.cassandra.tcm.membership.Location;
+import org.apache.cassandra.tcm.membership.NodeId;
+import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException;
+
+import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory;
+import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.DAEMON;
+import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.Interrupts.UNSYNCHRONIZED;
+import static org.apache.cassandra.concurrent.InfiniteLoopExecutor.SimulatorSafe.SAFE;
+import static org.apache.cassandra.config.CassandraRelevantProperties.ARTIFICIAL_LATENCY_LIMIT;
+import static org.apache.cassandra.net.MessagingService.instance;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+/*
+ * Mechanism to delay the sending of messages to peers
+ */
+public class ArtificialLatency extends ExecutorLocals.Impl
+{
+ private static volatile Set artificialLatencyVerbs;
+ private static volatile boolean artificialLatencyOnlyPermittedConsistencyLevels = true;
+ private static volatile ToLongFunction artificialLatencyNanos;
+ private static String artificialLatencies;
+
+ private static Sink running;
+
+ static
+ {
+ setArtificialLatencyVerbs(CassandraRelevantProperties.ARTIFICIAL_LATENCY_VERBS.getString(""));
+ String latencies = CassandraRelevantProperties.ARTIFICIAL_LATENCIES.getString();
+ String unsafeLatencies = CassandraRelevantProperties.ARTIFICIAL_LATENCIES_UNSAFE.getString();
+ if (latencies != null) setArtificialLatencies(latencies);
+ else if (unsafeLatencies != null) unsafeSetArtificialLatencies(unsafeLatencies);
+ if (artificialLatencyNanos != null && !artificialLatencyVerbs.isEmpty())
+ setEnabled(true);
+ }
+
+ // ensure initialised
+ public static void touch() {}
+
+ public static synchronized boolean isEnabled()
+ {
+ return running != null;
+ }
+
+ public static synchronized void setEnabled(boolean enabled)
+ {
+ if (enabled) start();
+ else stop();
+ }
+
+ public static synchronized void start()
+ {
+ if (running == null)
+ running = Sink.start();
+ }
+
+ public static synchronized void stop()
+ {
+ if (running != null)
+ {
+ running.stop();
+ running = null;
+ }
+ }
+
+ public static boolean isEligibleForArtificialLatency()
+ {
+ return ExecutorLocals.current().eligibleForArtificialLatency;
+ }
+
+ public static void setEligibleForArtificialLatency(boolean eligibleForArtificialLatency)
+ {
+ ExecutorLocals current = ExecutorLocals.current();
+ set(current.traceState, current.clientWarnState, eligibleForArtificialLatency);
+ }
+
+ static class Sink implements OutboundSink.AsyncFilter, Interruptible.Task
+ {
+ static class Delayed implements Comparable
+ {
+ final Message> message;
+ final InetAddressAndPort to;
+ final ConnectionType type;
+ final long deadline;
+ final OutboundSink.Sink sink;
+
+ Delayed(Message> message, InetAddressAndPort to, ConnectionType type, long deadline, OutboundSink.Sink sink)
+ {
+ this.message = message;
+ this.to = to;
+ this.type = type;
+ this.deadline = deadline;
+ this.sink = sink;
+ }
+
+ @Override
+ public int compareTo(Delayed that)
+ {
+ return Long.compare(this.deadline, that.deadline);
+ }
+ }
+
+ volatile boolean isShutdown;
+
+ final ConcurrentLinkedQueue in = new ConcurrentLinkedQueue<>();
+    // messages we have stashed in order to apply an artificial delay;
+    // each message's deadline is fixed when it is enqueued, so if the artificial delay is
+    // modified, already-queued messages keep their old deadlines until they drain
+ final PriorityQueue out = new PriorityQueue<>();
+ final Interruptible executor = executorFactory().infiniteLoop("ArtificialLatency", this, SAFE, DAEMON, UNSYNCHRONIZED);
+
+ volatile Thread waiting;
+ volatile long waitingUntil;
+ private static final AtomicLongFieldUpdater waitingUntilUpdater = AtomicLongFieldUpdater.newUpdater(Sink.class, "waitingUntil");
+
+ static Sink start()
+ {
+ Sink sink = new Sink();
+ instance().outboundSink.add(sink);
+ return sink;
+ }
+
+ void stop()
+ {
+ isShutdown = true;
+ artificialLatencyNanos = ignore -> 0;
+ instance().outboundSink.remove(this);
+ executor.shutdownNow();
+ try
+ {
+ executor.awaitTermination(1, TimeUnit.DAYS);
+ }
+ catch (InterruptedException e)
+ {
+ throw new UncheckedInterruptedException(e);
+ }
+ }
+
+ @Override
+ public void filter(Message> message, InetAddressAndPort to, ConnectionType type, OutboundSink.Sink next)
+ {
+ if (artificialLatencyOnlyPermittedConsistencyLevels && !message.header.permitsArtificialLatency())
+ {
+ next.accept(message, to, type);
+ return;
+ }
+
+ if (!artificialLatencyVerbs.contains(message.verb()))
+ {
+ next.accept(message, to, type);
+ return;
+ }
+
+ long addNanos = artificialLatencyNanos.applyAsLong(to);
+ if (addNanos <= 0)
+ {
+ next.accept(message, to, type);
+ return;
+ }
+
+ long deadline = nanoTime() + addNanos;
+ Delayed delay = new Delayed(message, to, type, deadline, next);
+ in.add(delay);
+
+ while (true)
+ {
+ long curWaitingUntil = waitingUntil;
+ if (deadline >= curWaitingUntil)
+ break;
+
+ if (waitingUntilUpdater.compareAndSet(this, curWaitingUntil, Long.MIN_VALUE))
+ {
+ Thread thread = waiting;
+ if (thread != null)
+ LockSupport.unpark(thread);
+ }
+ }
+
+ if (isShutdown && in.remove(delay))
+ next.accept(message, to, type);
+ }
+
+ public void run(Interruptible.State state) throws InterruptedException
+ {
+ switch (state)
+ {
+ default: throw new IllegalStateException();
+ case SHUTTING_DOWN:
+ {
+ drainIn();
+ out.forEach(d -> instance().send(d.message, d.to, d.type));
+ return;
+ }
+ case NORMAL:
+ {
+ waiting = Thread.currentThread();
+ while (true)
+ {
+ long deadline;
+ while (true)
+ {
+ drainIn();
+ deadline = out.isEmpty() ? Long.MAX_VALUE : out.peek().deadline;
+ if (waitingUntil == deadline)
+ break;
+ waitingUntil = deadline;
+ }
+
+ long waitNanos = deadline - nanoTime();
+ if (waitNanos <= 0)
+ break;
+ LockSupport.parkNanos(waitNanos);
+ }
+ }
+ case INTERRUPTED:
+ {
+ Delayed delayed;
+ long now = nanoTime();
+ while (null != (delayed = out.peek()) && delayed.deadline <= now)
+ {
+ Tracing tracing = (Tracing)delayed.message.params().get(ParamType.ACCORD_TRACING);
+ if (tracing != null)
+ tracing.trace(null, (delayed.message.verb().isResponse() ? "Reply" : "Request") + " delayed to " + ClusterMetadata.current().directory.peerId(delayed.to));
+ delayed.sink.accept(delayed.message, delayed.to, delayed.type);
+ out.poll();
+ }
+ }
+ }
+ }
+
+ private void drainIn()
+ {
+ for (Delayed delayed = in.poll(); delayed != null ; delayed = in.poll())
+ out.add(delayed);
+ }
+ }
+
+ public static String getArtificialLatencies()
+ {
+ return artificialLatencies;
+ }
+
+ private static long parseNanos(String latency)
+ {
+ if (!latency.endsWith("ms"))
+ throw new IllegalArgumentException("Latency must be specified in terms of milliseconds (with 'ms' suffix)");
+
+ return TimeUnit.MILLISECONDS.toNanos(Long.parseLong(latency.substring(0, latency.length() - 2)));
+ }
+
+ public static void setArtificialLatencies(String latencies)
+ {
+ setArtificialLatencies(latencies, parseNanos(ARTIFICIAL_LATENCY_LIMIT.getString()));
+ }
+
+ public static void unsafeSetArtificialLatencies(String latencies)
+ {
+ setArtificialLatencies(latencies, Long.MAX_VALUE);
+ }
+
+ private static synchronized void setArtificialLatencies(String latencies, long nanoLimit)
+ {
+ if (latencies.indexOf(',') < 0)
+ {
+ long nanos = parseNanos(latencies);
+ if (nanos >= nanoLimit)
+ throw new IllegalArgumentException("Artificial latency limit is " + nanoLimit + "ns; tried to set " + nanos + "ns");
+ artificialLatencyNanos = ignore -> nanos;
+ }
+ else
+ {
+ String[] parse = latencies.split(",");
+ Object2LongHashMap dcLatencies = new Object2LongHashMap<>(-1L);
+ for (int i = 0 ; i < parse.length ; ++i)
+ {
+ String[] subparse = parse[i].split(":");
+ String dc = subparse[0];
+ long nanos = parseNanos(subparse[1]);
+ if (nanos >= nanoLimit)
+ throw new IllegalArgumentException("Artificial latency limit is " + nanoLimit + "ns; tried to set " + nanos + "ns");
+ dcLatencies.put(dc, nanos);
+ }
+ artificialLatencyNanos = addr -> {
+ Directory directory = ClusterMetadata.current().directory;
+ NodeId nodeId = directory.peerId(addr);
+ if (nodeId == null)
+ return 0;
+ Location location = directory.location(nodeId);
+ if (location == null)
+ return 0;
+ return dcLatencies.getOrDefault(location.datacenter, 0L);
+ };
+ }
+ artificialLatencies = latencies;
+ }
+
+ public static String getArtificialLatencyVerbs()
+ {
+ return artificialLatencyVerbs.stream()
+ .map(Verb::toString)
+ .collect(Collectors.joining(","));
+ }
+
+ public static boolean getArtificialLatencyOnlyPermittedConsistencyLevels()
+ {
+ return artificialLatencyOnlyPermittedConsistencyLevels;
+ }
+
+ public static void setArtificialLatencyVerbs(String commaDelimitedVerbs)
+ {
+ if (commaDelimitedVerbs.isEmpty())
+ artificialLatencyVerbs = Collections.emptySet();
+ else
+ artificialLatencyVerbs = Arrays.stream(commaDelimitedVerbs.split(","))
+ .filter(s -> !s.isEmpty())
+ .map(s -> {
+ try
+ {
+ return EnumSet.of(Verb.valueOf(s));
+ }
+ catch (IllegalArgumentException iae)
+ {
+ try
+ {
+ return EnumSet.of(Verb.valueOf(s + "_REQ"), Verb.valueOf(s + "_RSP"));
+ }
+ catch (IllegalArgumentException ignore) {}
+ throw iae;
+ }
+ })
+ .collect(Collector.of(() -> EnumSet.noneOf(Verb.class), Set::addAll, (left, right) -> { left.addAll(right); return left; }));
+
+ }
+
+ public static void setArtificialLatencyOnlyPermittedConsistencyLevels(boolean onlyPermitted)
+ {
+ artificialLatencyOnlyPermittedConsistencyLevels = onlyPermitted;
+ }
+
+ public static String recommendedVerbs()
+ {
+ EnumSet verbs = EnumSet.noneOf(Verb.class);
+ verbs.add(Verb.MUTATION_REQ);
+ verbs.add(Verb.MUTATION_RSP);
+ verbs.add(Verb.READ_REQ);
+ verbs.add(Verb.READ_RSP);
+ verbs.add(Verb.RANGE_REQ);
+ verbs.add(Verb.RANGE_RSP);
+ verbs.add(Verb.READ_REPAIR_REQ);
+ verbs.add(Verb.READ_REPAIR_RSP);
+ verbs.add(Verb.HINT_REQ);
+ verbs.add(Verb.HINT_RSP);
+ for (Verb verb : Verb.values())
+ {
+ if (verb.name().startsWith("ACCORD") || verb.name().startsWith("PAXOS"))
+ verbs.add(verb);
+ }
+ return verbs.stream().map(Verb::toString).collect(Collectors.joining(","));
+ }
+}
diff --git a/src/java/org/apache/cassandra/net/InboundMessageHandler.java b/src/java/org/apache/cassandra/net/InboundMessageHandler.java
index 3f7c7b0e91c8..3e0d40675940 100644
--- a/src/java/org/apache/cassandra/net/InboundMessageHandler.java
+++ b/src/java/org/apache/cassandra/net/InboundMessageHandler.java
@@ -35,6 +35,7 @@
import org.apache.cassandra.net.FrameDecoder.IntactFrame;
import org.apache.cassandra.net.Message.Header;
import org.apache.cassandra.net.ResourceLimits.Limit;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing;
import org.apache.cassandra.tcm.ClusterMetadataService;
import org.apache.cassandra.tracing.TraceState;
import org.apache.cassandra.tracing.Tracing;
@@ -422,6 +423,7 @@ private void dispatch(ProcessMessage task)
TraceState state = Tracing.instance.initializeFromMessage(header);
if (state != null) state.trace("{} message received from {}", header.verb, header.from);
+ AccordRemoteTracing.traceOffWire(header);
callbacks.onDispatched(task.size(), header);
header.verb.stage.execute(ExecutorLocals.create(state), task);
diff --git a/src/java/org/apache/cassandra/net/Message.java b/src/java/org/apache/cassandra/net/Message.java
index 98ecca9ef580..41f0281706a8 100644
--- a/src/java/org/apache/cassandra/net/Message.java
+++ b/src/java/org/apache/cassandra/net/Message.java
@@ -22,7 +22,6 @@
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@@ -58,6 +57,7 @@
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static org.apache.cassandra.db.TypeSizes.sizeof;
import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
+import static org.apache.cassandra.net.MessageFlag.ARTIFICIAL_LATENCY;
import static org.apache.cassandra.net.MessagingService.VERSION_40;
import static org.apache.cassandra.net.MessagingService.VERSION_50;
import static org.apache.cassandra.net.MessagingService.VERSION_60;
@@ -147,6 +147,18 @@ public long expiresAtNanos()
return header.expiresAtNanos;
}
+ @Override
+ public boolean hasFlag(MessageFlag flag)
+ {
+ return header.hasFlag(flag);
+ }
+
+ @Override
+ public Map params()
+ {
+ return header.params();
+ }
+
/** For how long the message has lived. */
public long elapsedSinceCreated(TimeUnit units)
{
@@ -216,8 +228,6 @@ public TraceType traceType()
*/
public static Message out(Verb verb, T payload)
{
- assert !verb.isResponse() : verb;
-
return outWithParam(nextId(), verb, payload, null, null);
}
@@ -233,7 +243,6 @@ public static Message out(Verb verb, T payload, long expiresAtNanos)
public static Message out(Verb verb, T payload, boolean isUrgent)
{
- assert !verb.isResponse();
if (isUrgent)
return outWithFlag(verb, payload, MessageFlag.URGENT);
else
@@ -242,32 +251,17 @@ public static Message out(Verb verb, T payload, boolean isUrgent)
public static Message outWithFlag(Verb verb, T payload, MessageFlag flag)
{
- assert !verb.isResponse();
return outWithParam(nextId(), verb, 0, payload, flag.addTo(0), null, null);
}
- public static Message outWithFlags(Verb verb, T payload, MessageFlag flag1, MessageFlag flag2)
+ public static Message outWithFlag(Verb verb, T payload, Dispatcher.RequestTime requestTime, MessageFlag flag)
{
- assert !verb.isResponse();
- return outWithParam(nextId(), verb, 0, payload, flag2.addTo(flag1.addTo(0)), null, null);
+ return outWithFlags(verb, payload, requestTime, flag.addTo(0));
}
- public static Message outWithFlags(Verb verb, T payload, Dispatcher.RequestTime requestTime, List flags)
+ public static Message outWithFlags(Verb verb, T payload, Dispatcher.RequestTime requestTime, int encodedFlags)
{
- assert !verb.isResponse();
- int encodedFlags = 0;
- for (MessageFlag flag : flags)
- encodedFlags = flag.addTo(encodedFlags);
-
- return new Message(new Header(nextId(),
- epochSupplier.get(),
- verb,
- getBroadcastAddressAndPort(),
- requestTime.startedAtNanos(),
- requestTime.computeDeadline(verb.expiresAfterNanos()),
- encodedFlags,
- buildParams(null, null)),
- payload);
+ return outWithParam(nextId(), verb, requestTime.computeDeadline(verb.expiresAfterNanos()), payload, encodedFlags, null, null);
}
@VisibleForTesting
@@ -283,10 +277,11 @@ private static Message outWithParam(long id, Verb verb, long expiresAtNan
private static Message outWithParam(long id, Verb verb, long expiresAtNanos, T payload, int flags, ParamType paramType, Object paramValue)
{
- return withParam(getBroadcastAddressAndPort(), id, verb, expiresAtNanos, payload, flags, paramType, paramValue);
+ assert !verb.isManagedResponse();
+ return withParam(id, verb, expiresAtNanos, payload, flags, paramType, paramValue);
}
- private static Message withParam(InetAddressAndPort from, long id, Verb verb, long expiresAtNanos, T payload, int flags, ParamType paramType, Object paramValue)
+ private static Message withParam(long id, Verb verb, long expiresAtNanos, T payload, int flags, ParamType paramType, Object paramValue)
{
if (payload == null)
throw new IllegalArgumentException();
@@ -294,14 +289,17 @@ private static Message withParam(InetAddressAndPort from, long id, Verb v
long createdAtNanos = approxTime.now();
if (expiresAtNanos == 0)
expiresAtNanos = verb.expiresAtNanos(createdAtNanos);
+ if (ArtificialLatency.isEligibleForArtificialLatency())
+ flags = ARTIFICIAL_LATENCY.addTo(flags);
+ InetAddressAndPort from = getBroadcastAddressAndPort();
return new Message<>(new Header(id, epochSupplier.get(), verb, from, createdAtNanos, expiresAtNanos, flags, buildParams(paramType, paramValue)), payload);
}
public static Message internalResponse(Verb verb, T payload)
{
- assert verb.isResponse();
- return outWithParam(0, verb, payload, null, null);
+ assert verb.isManagedResponse();
+ return withParam(0, verb, 0, payload, 0, null, null);
}
/**
@@ -311,7 +309,7 @@ public static Message internalResponse(Verb verb, T payload)
@VisibleForTesting
public static Message remoteResponse(InetAddressAndPort from, Verb verb, T payload)
{
- assert verb.isResponse();
+ assert verb.isManagedResponse();
long createdAtNanos = approxTime.now();
long expiresAtNanos = verb.expiresAtNanos(createdAtNanos);
return new Message<>(new Header(0, epochSupplier.get(), verb, from, createdAtNanos, expiresAtNanos, 0, NO_PARAMS), payload);
@@ -324,7 +322,7 @@ public static Message remoteResponse(InetAddressAndPort from, Verb verb,
@VisibleForTesting
public static Message remoteResponseForTests(long id, InetAddressAndPort from, Verb verb, T payload)
{
- assert verb.isResponse();
+ assert verb.isManagedResponse();
long createdAtNanos = approxTime.now();
long expiresAtNanos = verb.expiresAtNanos(createdAtNanos);
return new Message<>(new Header(id, epochSupplier.get(), verb, from, createdAtNanos, expiresAtNanos, 0, NO_PARAMS), payload);
@@ -355,7 +353,8 @@ public Message responseWith(T payload)
public static Message responseWith(T payload, ResponseContext respondTo)
{
- return outWithParam(respondTo.id(), respondTo.verb().responseVerb, respondTo.expiresAtNanos(), payload, null, null);
+ int encodedFlags = respondTo.hasFlag(ARTIFICIAL_LATENCY) ? ARTIFICIAL_LATENCY.addTo(0) : 0;
+ return withParam(respondTo.id(), respondTo.verb().responseVerb, respondTo.expiresAtNanos(), payload, encodedFlags, null, null);
}
/** Builds a response Message with no payload, and all the right fields inferred from request Message */
@@ -382,7 +381,7 @@ public static Message failureResponse(RequestFailureReason reaso
static Message failureResponse(long id, long expiresAtNanos, RequestFailure reason)
{
- return outWithParam(id, Verb.FAILURE_RSP, expiresAtNanos, reason, null, null);
+ return withParam(id, Verb.FAILURE_RSP, expiresAtNanos, reason, 0, null, null);
}
public Message withPayload(V newPayload)
@@ -431,7 +430,7 @@ public Message withParams(Map values)
private static Map buildParams(ParamType type, Object value)
{
- Map params = NO_PARAMS;
+ EnumMap params = NO_PARAMS;
if (Tracing.isTracing())
params = Tracing.instance.addTraceHeaders(new EnumMap<>(ParamType.class));
@@ -590,9 +589,9 @@ boolean trackWarnings()
return MessageFlag.TRACK_WARNINGS.isIn(flags);
}
- boolean isFinal()
+ boolean permitsArtificialLatency()
{
- return !MessageFlag.NOT_FINAL.isIn(flags);
+ return ARTIFICIAL_LATENCY.isIn(flags);
}
@Nullable
@@ -665,7 +664,7 @@ public static class Builder
private InetAddressAndPort from;
private T payload;
private int flags = 0;
- private final Map params = new EnumMap<>(ParamType.class);
+ private final EnumMap params = new EnumMap<>(ParamType.class);
private long createdAtNanos;
private long expiresAtNanos;
private long id;
@@ -745,7 +744,7 @@ public Builder ofVerb(Verb verb)
this.verb = verb;
if (expiresAtNanos == 0 && verb != null && createdAtNanos != 0)
expiresAtNanos = verb.expiresAtNanos(createdAtNanos);
- if (!this.verb.isResponse() && from == null) // default to sending from self if we're a request verb
+ if (!this.verb.isManagedResponse() && from == null) // default to sending from self if we're a request verb
from = getBroadcastAddressAndPort();
return this;
}
diff --git a/src/java/org/apache/cassandra/net/MessageFlag.java b/src/java/org/apache/cassandra/net/MessageFlag.java
index 4c5762f9796e..2521c69168a6 100644
--- a/src/java/org/apache/cassandra/net/MessageFlag.java
+++ b/src/java/org/apache/cassandra/net/MessageFlag.java
@@ -25,15 +25,14 @@
public enum MessageFlag
{
/** a failure response should be sent back in case of failure */
- CALL_BACK_ON_FAILURE (0),
+ CALL_BACK_ON_FAILURE(0),
/** track repaired data - see CASSANDRA-14145 */
- TRACK_REPAIRED_DATA (1),
+ TRACK_REPAIRED_DATA (1),
/** allow creating warnings or aborting queries based off query - see CASSANDRA-16850 */
- TRACK_WARNINGS(2),
+ TRACK_WARNINGS (2),
/** whether this message should be sent on an URGENT channel despite its Verb default priority */
- URGENT(3),
- /** Allow a single callback to receive multiple responses until a final response is received **/
- NOT_FINAL(4)
+ URGENT (3),
+ ARTIFICIAL_LATENCY (4)
;
private final int id;
@@ -54,7 +53,7 @@ boolean isIn(int flags)
/**
* @return new flags value with this flag added
*/
- int addTo(int flags)
+ public int addTo(int flags)
{
return flags | (1 << id);
}
diff --git a/src/java/org/apache/cassandra/net/OutboundConnection.java b/src/java/org/apache/cassandra/net/OutboundConnection.java
index d5a95706a182..75b6993dfcfd 100644
--- a/src/java/org/apache/cassandra/net/OutboundConnection.java
+++ b/src/java/org/apache/cassandra/net/OutboundConnection.java
@@ -40,6 +40,7 @@
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.io.util.DataOutputBufferFixed;
import org.apache.cassandra.net.OutboundConnectionInitiator.Result.MessagingSuccess;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.FBUtilities;
@@ -829,6 +830,7 @@ boolean doRun(Established established)
}
Tracing.instance.traceOutgoingMessage(next, messageSize, settings.connectTo);
+ AccordRemoteTracing.traceOnWire(next.header, settings.connectTo);
Message.serializer.serialize(next, out, messagingVersion);
if (sending.length() != sendingBytes + messageSize)
diff --git a/src/java/org/apache/cassandra/net/OutboundSink.java b/src/java/org/apache/cassandra/net/OutboundSink.java
index 34c72dbc3a11..3d19cd06d6e5 100644
--- a/src/java/org/apache/cassandra/net/OutboundSink.java
+++ b/src/java/org/apache/cassandra/net/OutboundSink.java
@@ -18,7 +18,7 @@
package org.apache.cassandra.net;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
-import java.util.function.BiPredicate;
+import java.util.function.Predicate;
import org.apache.cassandra.locator.InetAddressAndPort;
@@ -38,22 +38,71 @@ public interface Sink
void accept(Message> message, InetAddressAndPort to, ConnectionType connectionType);
}
- private static class Filtered implements Sink
+ public interface Filter
+ {
+ public boolean test(Message> message, InetAddressAndPort to, ConnectionType type);
+ }
+
+ public interface AsyncFilter
+ {
+ void filter(Message> message, InetAddressAndPort to, ConnectionType type, Sink next);
+ }
+
+ private static abstract class AbstractFiltered implements Sink
{
- final BiPredicate, InetAddressAndPort> condition;
final Sink next;
- private Filtered(BiPredicate, InetAddressAndPort> condition, Sink next)
+ private AbstractFiltered(Sink next)
{
- this.condition = condition;
this.next = next;
}
+ abstract AbstractFiltered withNext(Sink next);
+ }
+
+ private static class Filtered extends AbstractFiltered
+ {
+ final Filter condition;
+
+ private Filtered(Filter condition, Sink next)
+ {
+ super(next);
+ this.condition = condition;
+ }
+
public void accept(Message> message, InetAddressAndPort to, ConnectionType connectionType)
{
- if (condition.test(message, to))
+ if (condition.test(message, to, connectionType))
next.accept(message, to, connectionType);
}
+
+ @Override
+ AbstractFiltered withNext(Sink next)
+ {
+ return new Filtered(condition, next);
+ }
+ }
+
+ private static class AsyncFiltered extends AbstractFiltered
+ {
+ final AsyncFilter filter;
+
+ private AsyncFiltered(AsyncFilter filter, Sink next)
+ {
+ super(next);
+ this.filter = filter;
+ }
+
+ public void accept(Message> message, InetAddressAndPort to, ConnectionType connectionType)
+ {
+ filter.filter(message, to, connectionType, next);
+ }
+
+ @Override
+ AbstractFiltered withNext(Sink next)
+ {
+ return new AsyncFiltered(filter, next);
+ }
}
private volatile Sink sink;
@@ -70,16 +119,26 @@ public void accept(Message> message, InetAddressAndPort to, ConnectionType con
sink.accept(message, to, connectionType);
}
- public void add(BiPredicate, InetAddressAndPort> allow)
+ public void add(Filter allow)
{
sinkUpdater.updateAndGet(this, sink -> new Filtered(allow, sink));
}
- public void remove(BiPredicate, InetAddressAndPort> allow)
+ public void remove(Filter allow)
{
sinkUpdater.updateAndGet(this, sink -> without(sink, allow));
}
+ public void add(AsyncFilter filter)
+ {
+ sinkUpdater.updateAndGet(this, sink -> new AsyncFiltered(filter, sink));
+ }
+
+ public void remove(AsyncFilter filter)
+ {
+ sinkUpdater.updateAndGet(this, sink -> without(sink, filter));
+ }
+
public void clear()
{
sinkUpdater.updateAndGet(this, OutboundSink::clear);
@@ -92,17 +151,28 @@ private static Sink clear(Sink sink)
return sink;
}
- private static Sink without(Sink sink, BiPredicate, InetAddressAndPort> condition)
+ private static Sink without(Sink sink, Filter condition)
{
- if (!(sink instanceof Filtered))
- return sink;
+ return without(sink, f -> f instanceof Filtered && condition.equals(((Filtered) f).condition));
+ }
- Filtered filtered = (Filtered) sink;
- Sink next = without(filtered.next, condition);
- return condition.equals(filtered.condition) ? next
- : next == filtered.next
- ? sink
- : new Filtered(filtered.condition, next);
+ private static Sink without(Sink sink, AsyncFilter filter)
+ {
+ return without(sink, f -> f instanceof AsyncFiltered && filter.equals(((AsyncFiltered) f).filter));
}
+ private static Sink without(Sink sink, Predicate remove)
+ {
+ if (!(sink instanceof AbstractFiltered))
+ return sink;
+
+ AbstractFiltered filtered = (AbstractFiltered) sink;
+ if (remove.test(filtered))
+ return filtered.next;
+
+ Sink next = without(filtered.next, remove);
+ if (next == filtered.next)
+ return filtered;
+ return filtered.withNext(next);
+ }
}
diff --git a/src/java/org/apache/cassandra/net/ParamType.java b/src/java/org/apache/cassandra/net/ParamType.java
index 5d3c92316f9a..b7f666617e10 100644
--- a/src/java/org/apache/cassandra/net/ParamType.java
+++ b/src/java/org/apache/cassandra/net/ParamType.java
@@ -20,6 +20,7 @@
import javax.annotation.Nullable;
import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing;
import org.apache.cassandra.service.writes.thresholds.WriteThresholdMapSerializer;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.Int32Serializer;
@@ -57,7 +58,9 @@ public enum ParamType
TOO_MANY_REFERENCED_INDEXES_WARN (16, Int32Serializer.serializer),
TOO_MANY_REFERENCED_INDEXES_FAIL (17, Int32Serializer.serializer),
WRITE_SIZE_WARN (18, WriteThresholdMapSerializer.serializer),
- WRITE_TOMBSTONE_WARN (19, WriteThresholdMapSerializer.serializer);
+ WRITE_TOMBSTONE_WARN (19, WriteThresholdMapSerializer.serializer),
+ ACCORD_TRACING (20, AccordRemoteTracing.tracingSerializer),
+ ;
final int id;
final IVersionedSerializer serializer;
diff --git a/src/java/org/apache/cassandra/net/ResponseContext.java b/src/java/org/apache/cassandra/net/ResponseContext.java
index c9b657a2ada8..a493f5b39299 100644
--- a/src/java/org/apache/cassandra/net/ResponseContext.java
+++ b/src/java/org/apache/cassandra/net/ResponseContext.java
@@ -17,9 +17,18 @@
*/
package org.apache.cassandra.net;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import accord.api.MessageSink;
+import accord.local.Node;
+import accord.messages.Reply;
import accord.messages.ReplyContext;
import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.service.accord.AccordMessageSink;
+
+import static java.util.concurrent.TimeUnit.NANOSECONDS;
public interface ResponseContext extends ReplyContext
{
@@ -27,4 +36,18 @@ public interface ResponseContext extends ReplyContext
InetAddressAndPort from();
Verb verb();
long expiresAtNanos();
+ boolean hasFlag(MessageFlag flag);
+ Map params();
+
+ @Override
+ default long expiresAt(TimeUnit units)
+ {
+ return units.convert(expiresAtNanos(), NANOSECONDS);
+ }
+
+ @Override
+ default void reply(Node.Id to, MessageSink sink, Reply success, Throwable failure)
+ {
+ ((AccordMessageSink)sink).reply(to, this, success, failure);
+ }
}
diff --git a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
index 2550ca2929b8..fb36c67c669f 100644
--- a/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
+++ b/src/java/org/apache/cassandra/net/ResponseVerbHandler.java
@@ -61,11 +61,7 @@ class ResponseVerbHandler implements IVerbHandler
@Override
public void doVerb(Message message)
{
- RequestCallbacks.CallbackInfo callbackInfo;
- if (message.header.isFinal())
- callbackInfo = MessagingService.instance().callbacks.remove(message.id(), message.from());
- else
- callbackInfo = MessagingService.instance().callbacks.get(message.id(), message.from());
+ RequestCallbacks.CallbackInfo callbackInfo = MessagingService.instance().callbacks.remove(message.id(), message.from());
if (callbackInfo == null)
{
String msg = "Callback already removed for {} (from {})";
diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java
index 97eea759aa29..1bb3b7a8900a 100644
--- a/src/java/org/apache/cassandra/net/Verb.java
+++ b/src/java/org/apache/cassandra/net/Verb.java
@@ -84,6 +84,7 @@
import org.apache.cassandra.service.EchoVerbHandler;
import org.apache.cassandra.service.SnapshotVerbHandler;
import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing;
import org.apache.cassandra.service.accord.interop.AccordInteropApply;
import org.apache.cassandra.service.accord.interop.AccordInteropRead;
import org.apache.cassandra.service.accord.interop.AccordInteropReadRepair;
@@ -104,6 +105,7 @@
import org.apache.cassandra.service.accord.serializers.PreacceptSerializers;
import org.apache.cassandra.service.accord.serializers.ReadDataSerializer;
import org.apache.cassandra.service.accord.serializers.RecoverySerializers;
+import org.apache.cassandra.service.accord.serializers.RemoteSuccessSerializers;
import org.apache.cassandra.service.accord.serializers.SetDurableSerializers;
import org.apache.cassandra.service.accord.serializers.SimpleReplySerializer;
import org.apache.cassandra.service.accord.serializers.Version;
@@ -373,9 +375,11 @@ public enum Verb
ACCORD_INTEROP_APPLY_REQ (166, P2, writeTimeout, IMMEDIATE, () -> accordEmbedded(AccordInteropApply.serializer), AccordService::requestHandlerOrNoop, ACCORD_APPLY_RSP),
ACCORD_FETCH_WATERMARKS_RSP (167, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(WatermarkCollector.serializer), RESPONSE_HANDLER),
// NoPayload can not be prefixed with accord version as it is special cased in C* messaging
- ACCORD_FETCH_WATERMARKS_REQ (168, P0, shortTimeout, FETCH_METADATA, () -> NoPayload.serializer, AccordService::watermarkHandlerOrNoop, ACCORD_FETCH_WATERMARKS_RSP),
+ ACCORD_FETCH_WATERMARKS_REQ (168, P0, shortTimeout, FETCH_METADATA, () -> NoPayload.serializer, AccordService::watermarkHandlerOrNoop, ACCORD_FETCH_WATERMARKS_RSP),
ACCORD_FETCH_TOPOLOGY_RSP (169, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(FetchTopologies.responseSerializer), RESPONSE_HANDLER),
ACCORD_FETCH_TOPOLOGY_REQ (170, P0, shortTimeout, FETCH_METADATA, () -> accordEmbedded(FetchTopologies.serializer), () -> FetchTopologies.handler, ACCORD_FETCH_TOPOLOGY_RSP),
+ ACCORD_REMOTE_SUCCESS_REQ (173, P0, shortTimeout, IMMEDIATE, () -> accordEmbedded(RemoteSuccessSerializers.remoteSuccess), AccordService::requestHandlerOrNoop),
+ ACCORD_REMOTE_TRACE(174, P0, shortTimeout, IMMEDIATE, () -> AccordRemoteTracing.traceSerializer, () -> AccordRemoteTracing.traceMessageHandler),
DICTIONARY_UPDATE_RSP (171, P1, rpcTimeout, REQUEST_RESPONSE, () -> NoPayload.serializer, RESPONSE_HANDLER ),
DICTIONARY_UPDATE_REQ (172, P1, rpcTimeout, MISC, () -> CompressionDictionaryUpdateMessage.serializer, () -> CompressionDictionaryUpdateVerbHandler.instance, DICTIONARY_UPDATE_RSP ),
@@ -440,6 +444,7 @@ public enum Kind
private final Supplier extends IVerbHandler>> handler;
public final Verb responseVerb;
+ private final boolean isResponse;
private final ToLongFunction expiration;
@@ -488,6 +493,8 @@ public enum Kind
this.responseVerb = responseVerb;
this.expiration = expiration;
this.kind = kind;
+ // this is a little hacky, but reduces the number of parameters up top
+ this.isResponse = name().endsWith("_RSP") || handler == RESPONSE_HANDLER;
}
public IVersionedAsymmetricSerializer serializer()
@@ -516,9 +523,14 @@ public long expiresAfter(TimeUnit units)
}
// this is a little hacky, but reduces the number of parameters up top
+ public boolean isManagedResponse()
+ {
+ return handler == RESPONSE_HANDLER;
+ }
+
public boolean isResponse()
{
- return handler.get() == ResponseVerbHandler.instance;
+ return isResponse;
}
@VisibleForTesting
diff --git a/src/java/org/apache/cassandra/service/ClientWarn.java b/src/java/org/apache/cassandra/service/ClientWarn.java
index aff945c4d058..a6de392e65bd 100644
--- a/src/java/org/apache/cassandra/service/ClientWarn.java
+++ b/src/java/org/apache/cassandra/service/ClientWarn.java
@@ -40,7 +40,7 @@ public State get()
public void set(State value)
{
ExecutorLocals current = ExecutorLocals.current();
- ExecutorLocals.Impl.set(current.traceState, value);
+ ExecutorLocals.Impl.set(current.traceState, value, current.eligibleForArtificialLatency);
}
public void warn(String text)
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 5c1ebc9ed5f3..70de57d6be1a 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -21,7 +21,6 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
-import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
@@ -62,7 +61,7 @@
import org.apache.cassandra.batchlog.BatchlogManager;
import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask;
import org.apache.cassandra.concurrent.Stage;
-import org.apache.cassandra.config.AccordSpec;
+import org.apache.cassandra.config.AccordConfig;
import org.apache.cassandra.config.CassandraRelevantProperties;
import org.apache.cassandra.config.Config;
import org.apache.cassandra.config.DatabaseDescriptor;
@@ -129,6 +128,7 @@
import org.apache.cassandra.metrics.DenylistMetrics;
import org.apache.cassandra.metrics.ReadRepairMetrics;
import org.apache.cassandra.metrics.StorageMetrics;
+import org.apache.cassandra.net.ArtificialLatency;
import org.apache.cassandra.net.ForwardingInfo;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessageFlag;
@@ -759,7 +759,7 @@ private static PaxosBallotAndContention beginAndRepairPaxos(Dispatcher.RequestTi
if (Iterables.size(missingMRC) > 0)
{
Tracing.trace("Repairing replicas that missed the most recent commit");
- sendCommit(mostRecent, missingMRC);
+ sendCommit(mostRecent, consistencyForPaxos, missingMRC);
// TODO: provided commits don't invalid the prepare we just did above (which they don't), we could just wait
// for all the missingMRC to acknowledge this commit and then move on with proposing our value. But that means
// adding the ability to have commitPaxos block, which is exactly CASSANDRA-5442 will do. So once we have that
@@ -782,7 +782,7 @@ private static PaxosBallotAndContention beginAndRepairPaxos(Dispatcher.RequestTi
/**
* Unlike commitPaxos, this does not wait for replies
*/
- private static void sendCommit(Commit commit, Iterable replicas)
+ private static void sendCommit(Commit commit, ConsistencyLevel consistencyForPaxos, Iterable replicas)
{
Message message = Message.out(PAXOS_COMMIT_REQ, commit);
for (InetAddressAndPort target : replicas)
@@ -796,7 +796,6 @@ private static PrepareCallback preparePaxos(Commit toPrepare, ReplicaPlan.ForPax
Message message = Message.out(PAXOS_PREPARE_REQ, toPrepare);
boolean hasLocalRequest = false;
-
for (Replica replica: replicaPlan.contacts())
{
if (replica.isSelf())
@@ -838,6 +837,7 @@ private static boolean proposePaxos(Commit proposal, ReplicaPlan.ForPaxosWrite r
{
ProposeCallback callback = new ProposeCallback(replicaPlan.contacts().size(), replicaPlan.requiredParticipants(), !backoffIfPartial, replicaPlan.consistencyLevel(), requestTime);
Message message = Message.out(PAXOS_PROPOSE_REQ, proposal);
+
for (Replica replica : replicaPlan.contacts())
{
if (replica.isSelf())
@@ -1355,7 +1355,7 @@ public static void dispatchMutationsWithRetryOnDifferentSystem(List extends IM
private static void checkMixedTimeSourceHandling()
{
- AccordSpec.MixedTimeSourceHandling handling = DatabaseDescriptor.getAccord().mixedTimeSourceHandling;
+ AccordConfig.MixedTimeSourceHandling handling = DatabaseDescriptor.getAccord().mixedTimeSourceHandling;
switch (handling)
{
case log:
@@ -1363,7 +1363,7 @@ private static void checkMixedTimeSourceHandling()
{
ClientWarn.instance.warn(UNSAFE_MIXED_MUTATIONS_MSG);
logger.warn(UNSAFE_MIXED_MUTATIONS_MSG);
- if (handling == AccordSpec.MixedTimeSourceHandling.reject)
+ if (handling == AccordConfig.MixedTimeSourceHandling.reject)
throw new InvalidRequestException(UNSAFE_MIXED_MUTATIONS_MSG);
}
break;
@@ -1862,10 +1862,10 @@ public static void sendToHintedReplicas(final Mutation mutation,
// belongs on a different server
if (message == null)
{
- message = Message.outWithFlags(MUTATION_REQ,
- mutation,
- requestTime,
- Collections.singletonList(MessageFlag.CALL_BACK_ON_FAILURE));
+ message = Message.outWithFlag(MUTATION_REQ,
+ mutation,
+ requestTime,
+ MessageFlag.CALL_BACK_ON_FAILURE);
}
String dc = DatabaseDescriptor.getLocator().location(destination.endpoint()).datacenter;
@@ -2356,7 +2356,7 @@ private static ConsensusAttemptResult legacyReadWithPaxos(SinglePartitionReadCom
try
{
- final ConsistencyLevel consistencyForReplayCommitsOrFetch = consistencyLevel == ConsistencyLevel.LOCAL_SERIAL
+ final ConsistencyLevel consistencyForReplayCommitsOrFetch = consistencyLevel.isDatacenterLocal()
? ConsistencyLevel.LOCAL_QUORUM
: ConsistencyLevel.QUORUM;
@@ -3408,6 +3408,18 @@ private static Future submitHint(HintRunnable runnable)
public Long getTruncateRpcTimeout() { return DatabaseDescriptor.getTruncateRpcTimeout(MILLISECONDS); }
public void setTruncateRpcTimeout(Long timeoutInMillis) { DatabaseDescriptor.setTruncateRpcTimeout(timeoutInMillis); }
+ public boolean getArtificialLatencyEnabled() { return ArtificialLatency.isEnabled(); }
+ public void setArtificialLatencyEnabled(boolean enabled) { ArtificialLatency.setEnabled(enabled); }
+
+ public String getArtificialLatencyVerbs() { return ArtificialLatency.getArtificialLatencyVerbs(); }
+ public void setArtificialLatencyVerbs(String commaDelimitedVerbs) { ArtificialLatency.setArtificialLatencyVerbs(commaDelimitedVerbs); }
+
+ public String getArtificialLatencies() { return ArtificialLatency.getArtificialLatencies(); }
+ public void setArtificialLatencies(String latencies) { ArtificialLatency.setArtificialLatencies(latencies); }
+
+ public boolean getAllowArtificialLatencyForAllConsistencyLevels() { return ArtificialLatency.getArtificialLatencyOnlyPermittedConsistencyLevels(); }
+ public void setAllowArtificialLatencyForAllConsistencyLevels(boolean onlyPermitted) { ArtificialLatency.setArtificialLatencyOnlyPermittedConsistencyLevels(onlyPermitted); }
+
public Long getNativeTransportMaxConcurrentConnections() { return DatabaseDescriptor.getNativeTransportMaxConcurrentConnections(); }
public void setNativeTransportMaxConcurrentConnections(Long nativeTransportMaxConcurrentConnections) { DatabaseDescriptor.setNativeTransportMaxConcurrentConnections(nativeTransportMaxConcurrentConnections); }
diff --git a/src/java/org/apache/cassandra/service/StorageProxyMBean.java b/src/java/org/apache/cassandra/service/StorageProxyMBean.java
index 1c4887f1e177..eabe28ece719 100644
--- a/src/java/org/apache/cassandra/service/StorageProxyMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageProxyMBean.java
@@ -52,6 +52,15 @@ public interface StorageProxyMBean
public Long getTruncateRpcTimeout();
public void setTruncateRpcTimeout(Long timeoutInMillis);
+ public boolean getArtificialLatencyEnabled();
+ public void setArtificialLatencyEnabled(boolean enabled);
+ public String getArtificialLatencyVerbs();
+ public void setArtificialLatencyVerbs(String commaDelimitedVerbs);
+ public String getArtificialLatencies();
+ public void setArtificialLatencies(String latencies);
+ public boolean getAllowArtificialLatencyForAllConsistencyLevels();
+ public void setAllowArtificialLatencyForAllConsistencyLevels(boolean onlyPermitted);
+
public void setNativeTransportMaxConcurrentConnections(Long nativeTransportMaxConcurrentConnections);
public Long getNativeTransportMaxConcurrentConnections();
diff --git a/src/java/org/apache/cassandra/service/accord/AccordCache.java b/src/java/org/apache/cassandra/service/accord/AccordCache.java
index 0ff30acde892..6b6c48b2fc4b 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordCache.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordCache.java
@@ -67,6 +67,7 @@
import org.apache.cassandra.service.accord.AccordCache.Adapter.Shrink;
import org.apache.cassandra.service.accord.AccordCacheEntry.LoadExecutor;
import org.apache.cassandra.service.accord.AccordCacheEntry.Status;
+import org.apache.cassandra.service.accord.AccordSafeCommandsForKey.CommandsForKeyCacheEntry;
import org.apache.cassandra.service.accord.events.CacheEvents;
import org.apache.cassandra.service.accord.journal.CommandChangeWriter;
import org.apache.cassandra.service.accord.journal.CommandChanges;
@@ -1193,7 +1194,7 @@ public Object fullShrink(RoutingKey key, CommandsForKey value)
return null;
TxnId last = value.size() == 0 ? null : value.get(value.size() - 1);
- TxnId minUndecided = value.minUndecided();
+ TxnId minUndecided = value.minUndecidedManaged();
int lastSize = (int) CommandSerializers.txnId.serializedSize(last);
int minUndecidedSize = (int) CommandSerializers.txnId.serializedSize(minUndecided);
ByteBuffer result = Serialize.toBytesWithoutKey(lastSize + minUndecidedSize, value.maximalPrune());
@@ -1249,11 +1250,19 @@ public Comparator keyComparator()
{
return RoutingKey::compareAsRoutingKey;
}
+
+ @Override
+ public AccordCacheEntry newEntry(RoutingKey key, Type.Instance owner)
+ {
+ CommandsForKeyCacheEntry entry = new CommandsForKeyCacheEntry(key, owner);
+ entry.readyToLoad();
+ return entry;
+ }
}
public static class CommandAdapter implements Adapter
{
- private static int SHRINK_WITHOUT_LOCK = -1;
+ private static final int SHRINK_WITHOUT_LOCK = -1;
public static final CommandAdapter COMMAND_ADAPTER = new CommandAdapter();
diff --git a/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java b/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java
index 1904709b8c66..d3676ef23b9a 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordCommandStore.java
@@ -60,6 +60,8 @@
import accord.local.CommandSummaries;
import accord.local.MaxConflicts;
import accord.local.MaxDecidedRX;
+import accord.local.MinimalCommand;
+import accord.local.MinimalCommand.MinimalWithDeps;
import accord.local.NodeCommandStoreService;
import accord.local.PreLoadContext;
import accord.local.PreLoadContext.Empty;
@@ -67,7 +69,6 @@
import accord.local.RedundantBefore.Bounds;
import accord.local.RedundantStatus.Property;
import accord.local.RedundantStatus.SomeStatus;
-import accord.local.RejectBefore;
import accord.local.SafeCommandStore;
import accord.local.cfk.CommandsForKey;
import accord.primitives.PartialTxn;
@@ -87,8 +88,8 @@
import accord.utils.async.AsyncResult;
import accord.utils.async.AsyncResults.CountingResult;
-import org.apache.cassandra.config.AccordSpec;
-import org.apache.cassandra.config.AccordSpec.JournalSpec.ReplayMode;
+import org.apache.cassandra.config.AccordConfig;
+import org.apache.cassandra.config.AccordConfig.JournalConfig.ReplayMode;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.io.util.File;
@@ -255,7 +256,7 @@ public AccordCommandStore(int id,
this.exclusiveExecutor = sharedExecutor.executor(id);
{
- AccordSpec.RangeIndexMode mode = getAccord().range_index_mode;
+ AccordConfig.RangeIndexMode mode = getAccord().range_index_mode;
switch (mode)
{
default: throw new UnhandledEnum(mode);
@@ -388,7 +389,7 @@ CommandsForKey loadCommandsForKey(RoutableKey key)
RedundantBefore.QuickBounds bounds = safeGetRedundantBefore().get(key);
if (!Invariants.expect(bounds != null, "No RedundantBefore information found when loading key %s", key))
return cfk;
- return cfk.withGcBeforeAtLeast(bounds.gcBefore, false);
+ return cfk.withCleanCfkBeforeAtLeast(bounds.cleanCfkBefore(), false);
}
boolean validateCommandsForKey(RoutableKey key, CommandsForKey evicting)
@@ -422,6 +423,18 @@ public AsyncChain chain(PreLoadContext preLoadContext, Consumer super Sa
return AccordTask.create(this, preLoadContext, consumer).chain();
}
+ @Override
+ public AsyncChain priorityChain(PreLoadContext preLoadContext, Consumer super SafeCommandStore> consumer)
+ {
+ return AccordTask.create(this, preLoadContext, consumer).priorityChain();
+ }
+
+ @Override
+ public AsyncChain priorityChain(PreLoadContext preLoadContext, Function super SafeCommandStore, T> function)
+ {
+ return AccordTask.create(this, preLoadContext, function).priorityChain();
+ }
+
@Override
public AsyncChain chain(Callable call)
{
@@ -434,6 +447,12 @@ public void execute(Runnable run)
taskExecutor().execute(run);
}
+ @Override
+ public boolean tryExecuteImmediately(Runnable run)
+ {
+ return taskExecutor().tryExecuteImmediately(run);
+ }
+
public AccordSafeCommandStore begin(AccordTask> operation, @Nullable CommandSummaries commandsForRanges)
{
require(current == null);
@@ -575,12 +594,12 @@ public static Command prepareToCache(Command command)
return command;
}
- public Command.Minimal loadMinimal(TxnId txnId)
+ public MinimalCommand loadMinimal(TxnId txnId)
{
return journal.loadMinimal(id, txnId, safeGetRedundantBefore(), durableBefore());
}
- public Command.MinimalWithDeps loadMinimalWithDeps(TxnId txnId)
+ public MinimalWithDeps loadMinimalWithDeps(TxnId txnId)
{
return journal.loadMinimalWithDeps(id, txnId, safeGetRedundantBefore(), durableBefore());
}
@@ -805,7 +824,6 @@ AsyncChain saveState(Descriptor descriptor)
tmpSaveDir.createDirectoriesIfNotExists();
writeOne(new File(tmpSaveDir, "max_decidedrx"), unsafeGetMaxDecidedRX(), maxDecidedRX);
writeOne(new File(tmpSaveDir, "max_conflicts"), unsafeGetMaxConflicts(), maxConflicts);
- writeOne(new File(tmpSaveDir, "reject_before"), unsafeGetRejectBefore(), rejectBefore);
writeList(new File(tmpSaveDir, "listeners"), ((DefaultLocalListeners)listeners).snapshot(), txnListener);
writeList(new File(tmpSaveDir, "progress_log"), ((DefaultProgressLog)progressLog).snapshot(), progressLogState);
rangeIndex.save(new File(tmpSaveDir, "range_index"));
@@ -872,7 +890,7 @@ AsyncChain> restoreState()
File savePoint = savePoints[savePoints.length - 1];
long segment = Long.parseLong(savePoint.name());
- MaxDecidedRX mxd; MaxConflicts mxc; RejectBefore rjb;
+ MaxDecidedRX mxd; MaxConflicts mxc;
List dll; List dpl; Object rgi;
RedundantBefore rdb;
try
@@ -880,7 +898,11 @@ AsyncChain> restoreState()
logger.info("{} loading state from {}", this, savePoint);
mxd = readOne(new File(savePoint, "max_decidedrx"), maxDecidedRX);
mxc = readOne(new File(savePoint, "max_conflicts"), maxConflicts);
- rjb = readOne(new File(savePoint, "reject_before"), rejectBefore);
+ {
+ File rjbf = new File(savePoint, "reject_before");
+ if (rjbf.exists())
+ mxc = mxc.with(readOne(rjbf, rejectBefore));
+ }
dll = readList(new File(savePoint, "listeners"), txnListener);
dpl = readList(new File(savePoint, "progress_log"), progressLogState);
rgi = rangeIndex.load(new File(savePoint, "range_index"));
@@ -898,7 +920,6 @@ AsyncChain> restoreState()
rangeIndex.restore(rgi);
unsafeSetMaxDecidedRX(mxd);
unsafeSetMaxConflicts(mxc);
- unsafeSetRejectBefore(rjb);
((DefaultLocalListeners) listeners).restore(dll);
boolean unsetCatchup = ((DefaultProgressLog) progressLog).setModeExclusive(safeStore, CATCH_UP);
((DefaultProgressLog) progressLog).restore(safeStore, dpl);
diff --git a/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java b/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java
index df2ef42fb98c..fa007bdab93a 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordCommandStores.java
@@ -48,14 +48,14 @@
import org.apache.cassandra.cache.CacheSize;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.concurrent.Shutdownable;
-import org.apache.cassandra.config.AccordSpec.QueueShardModel;
+import org.apache.cassandra.config.AccordConfig.QueueShardModel;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.journal.Descriptor;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.service.accord.AccordCommandStore.DurablyAppliedTo;
import org.apache.cassandra.service.accord.AccordExecutor.AccordExecutorFactory;
-import static org.apache.cassandra.config.AccordSpec.QueueShardModel.THREAD_PER_SHARD;
+import static org.apache.cassandra.config.AccordConfig.QueueShardModel.THREAD_PER_SHARD;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordQueueShardCount;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordQueueSubmissionModel;
import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITHOUT_LOCK;
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutor.java b/src/java/org/apache/cassandra/service/accord/AccordExecutor.java
index c53d04cd6183..62fc80c20491 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutor.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutor.java
@@ -25,6 +25,7 @@
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.RejectedExecutionException;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.LockSupport;
@@ -46,6 +47,13 @@
import accord.local.PreLoadContext;
import accord.local.SequentialAsyncExecutor;
import accord.local.cfk.CommandsForKey;
+import accord.messages.Accept;
+import accord.messages.Commit;
+import accord.messages.MessageType;
+import accord.messages.MessageType.StandardMessage;
+import accord.messages.Request;
+import accord.primitives.Ballot;
+import accord.primitives.SaveStatus;
import accord.primitives.TxnId;
import accord.utils.ArrayBuffers.BufferList;
import accord.utils.IntrusivePriorityHeap;
@@ -69,6 +77,8 @@
import org.apache.cassandra.concurrent.DebuggableTask.DebuggableTaskRunner;
import org.apache.cassandra.concurrent.ExecutorLocals;
import org.apache.cassandra.concurrent.Shutdownable;
+import org.apache.cassandra.config.AccordConfig.QueuePriorityModel;
+import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.metrics.AccordCacheMetrics;
import org.apache.cassandra.metrics.AccordExecutorMetrics;
import org.apache.cassandra.metrics.AccordReplicaMetrics;
@@ -80,6 +90,9 @@
import org.apache.cassandra.service.accord.AccordCacheEntry.LoadExecutor;
import org.apache.cassandra.service.accord.AccordCacheEntry.SaveExecutor;
import org.apache.cassandra.service.accord.AccordCacheEntry.UniqueSave;
+import org.apache.cassandra.service.accord.debug.DebugExecution.DebugExecutor;
+import org.apache.cassandra.service.accord.debug.DebugExecution.DebugSequentialExecutor;
+import org.apache.cassandra.service.accord.debug.DebugExecution.DebugTask;
import org.apache.cassandra.utils.Closeable;
import org.apache.cassandra.utils.MonotonicClock;
import org.apache.cassandra.utils.WithResources;
@@ -90,6 +103,7 @@
import io.netty.util.concurrent.FastThreadLocal;
import static accord.utils.Invariants.createIllegalState;
+import static org.apache.cassandra.config.AccordConfig.QueuePriorityModel.PHASE_HLC_FIFO;
import static org.apache.cassandra.service.accord.AccordCache.CommandAdapter.COMMAND_ADAPTER;
import static org.apache.cassandra.service.accord.AccordCache.CommandsForKeyAdapter.CFK_ADAPTER;
import static org.apache.cassandra.service.accord.AccordCache.registerJfrListener;
@@ -99,6 +113,8 @@
import static org.apache.cassandra.service.accord.AccordTask.State.SCANNING_RANGES;
import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_LOAD;
import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_RUN;
+import static org.apache.cassandra.service.accord.debug.DebugExecution.DEBUG_EXECUTION;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
/**
* NOTE: We assume that NO BLOCKING TASKS are submitted to this executor AND WAITED ON by another task executing on this executor.
@@ -107,7 +123,12 @@
public abstract class AccordExecutor implements CacheSize, LoadExecutor, Boolean>, SaveExecutor, Shutdownable, AbstractAsyncExecutor
{
private static final Logger logger = LoggerFactory.getLogger(AccordExecutor.class);
+
+ private static final long PRIORITY_BITS = 0x7000000000000000L;
+ private static final QueuePriorityModel PRIORITY_MODEL = DatabaseDescriptor.getAccord().queue_priority_model;
+ private static final long AGE_TO_FIFO = DatabaseDescriptor.getAccord().queue_priority_age_to_fifo.to(TimeUnit.MICROSECONDS);
public static final ShardedDecayingHistograms HISTOGRAMS = new ShardedDecayingHistograms();
+ private static final FastThreadLocal paranoidPriorityInversionCheck = new FastThreadLocal<>();
public interface AccordExecutorFactory
{
@@ -130,7 +151,7 @@ public ExclusiveGlobalCaches(AccordExecutor executor, AccordCache global, Accord
@Override
public void close()
{
- executor.beforeUnlock();
+ executor.beforeUnlockExternal();
global.tryShrinkOrEvict(executor.lock);
executor.unlock();
}
@@ -150,6 +171,25 @@ public GlobalCaches(AccordCache global, AccordCache.Type paranoidPriorityInversionCheck = new FastThreadLocal<>();
+ abstract boolean isInLoop();
final Lock unsafeLock()
{
@@ -259,24 +283,34 @@ final Lock unsafeLock()
final void lock()
{
- if (Invariants.isParanoid())
- {
- Lock locked = paranoidPriorityInversionCheck.getAndSet(lock);
- Invariants.require(locked == null || locked == lock, "Tried to take multiple AccordExecutor locks on same thread - this is dangerous for progress");
- }
+ if (Invariants.isParanoid()) paranoidLockExclusive();
//noinspection LockAcquiredButNotSafelyReleased
lock.lock();
+ if (DEBUG_EXECUTION) debug.onEnterLock();
}
final void unlock()
{
+ if (Invariants.isParanoid()) paranoidUnlockExclusive();
+ if (DEBUG_EXECUTION) debug.onExitLock();
lock.unlock();
+ }
+
+ final void paranoidLockExclusive()
+ {
+ Lock locked = paranoidPriorityInversionCheck.getAndSet(lock);
+ Invariants.require(locked == null || locked == lock, "Tried to take multiple AccordExecutor locks on same thread - this is dangerous for progress");
+ }
+
+ final void paranoidUnlockExclusive()
+ {
paranoidPriorityInversionCheck.set(null);
}
final boolean tryLock()
{
boolean result = lock.tryLock();
+ if (DEBUG_EXECUTION && result) debug.onEnterLock();
if (Invariants.isParanoid())
{
if (result)
@@ -310,26 +344,30 @@ public AccordCache cacheUnsafe()
return cache;
}
- boolean hasWaitingToRun()
+ final boolean hasWaitingToRun()
{
updateWaitingToRunExclusive();
return !waitingToRun.isEmpty();
}
- Task pollWaitingToRunExclusive()
+ void updateWaitingToRunExclusive()
+ {
+ // TODO (expected): this should not be invoked on every update of waiting to run
+ maybeUnpauseLoading();
+ }
+
+ final Task pollWaitingToRunExclusive()
{
updateWaitingToRunExclusive();
Task next = waitingToRun.poll();
if (next != null)
+ {
+ if (DEBUG_EXECUTION) next.debug.onPolled();
next.addToQueue(running);
+ }
return next;
}
- void updateWaitingToRunExclusive()
- {
- maybeUnpauseLoading();
- }
-
public Stream extends DebuggableTaskRunner> active()
{
return Stream.of();
@@ -387,7 +425,8 @@ public void afterSubmittedAndConsequences(Runnable run)
if (waitingForCompletion == null)
waitingForCompletion = new ArrayDeque<>();
- int position = nextPosition;
+ long position = nextPosition;
+ minPosition = position;
waitingForCompletion.add(new WaitForCompletion(position, run));
}
finally
@@ -409,7 +448,7 @@ void maybeUnpauseLoading()
}
public abstract boolean hasTasks();
- abstract void beforeUnlock();
+ abstract void beforeUnlockExternal();
abstract boolean isOwningThread();
private void enqueueLoadsExclusive()
@@ -569,27 +608,29 @@ private void updateQueue(AccordTask> task)
private void waitingToRun(AccordTask task)
{
+ task.onWaitingToRun();
task.addToQueue(task.commandStore.exclusiveExecutor);
}
private void waitingToRun(SubmittableTask task, @Nullable SequentialExecutor queue)
{
+ task.onWaitingToRun();
task.addToQueue(queue == null ? waitingToRun : queue);
}
public SequentialExecutor executor()
{
- return new SequentialExecutor();
+ return new SequentialExecutor(this);
}
public SequentialExecutor executor(int commandStoreId)
{
- return new SequentialExecutor(commandStoreId);
+ return new SequentialExecutor(this, commandStoreId);
}
public SequentialAsyncExecutor newSequentialExecutor()
{
- return new SequentialExecutor();
+ return new SequentialExecutor(this);
}
public void cancel(AccordTask task)
@@ -639,9 +680,25 @@ void submit(AccordTask operation)
submit(AccordExecutor::submitExclusive, i -> i, operation);
}
+ void submitPriority(AccordTask operation)
+ {
+ submit(AccordExecutor::submitPriorityExclusive, i -> i, operation);
+ }
+
void submitExclusive(AccordTask> task)
{
assignQueuePosition(task);
+ submitInternalExclusive(task);
+ }
+
+ void submitPriorityExclusive(AccordTask> task)
+ {
+ assignMinQueuePosition(task);
+ submitInternalExclusive(task);
+ }
+
+ private void submitInternalExclusive(AccordTask> task)
+ {
task.setupExclusive();
++tasks;
updateQueue(task);
@@ -670,21 +727,126 @@ T submitPlainExclusive(Task parent, T task)
Invariants.require(isOwningThread());
++tasks;
if (parent != null) inheritQueuePosition(parent, task);
- else assignNewQueuePosition(task);
+ else assignFifoQueuePosition(task);
+ task.onWaitingToRun();
waitingToRun.append(task);
return task;
}
private void assignQueuePosition(Task task)
{
- if (task.queuePosition == 0)
- assignNewQueuePosition(task);
+ if (task.queuePosition != 0) updateNextPosition(task);
+ else assignFifoQueuePosition(task);
+ }
+
+ private void assignQueuePosition(AccordTask> task)
+ {
+ if (task.queuePosition != 0) updateNextPosition(task);
+ else
+ {
+ long priority_bits = PRIORITY_BITS;
+ TxnId txnId = null;
+ switch (PRIORITY_MODEL)
+ {
+ case ORIG_PHASE_HLC_FIFO:
+ case PHASE_HLC_FIFO:
+ {
+ // TODO (expected): we should process messages for a TxnId together, to avoid processing delayed messages out of order
+ PreLoadContext context = task.preLoadContext();
+ if (context instanceof Request)
+ {
+ MessageType type = ((Request) context).type();
+ if (type instanceof StandardMessage)
+ {
+ TxnId txnId0 = context.primaryTxnId();
+ switch ((StandardMessage)type)
+ {
+ case APPLY_REQ:
+ {
+ priority_bits = 0L;
+ txnId = txnId0;
+ break;
+ }
+ case READ_EPHEMERAL_REQ:
+ case READ_REQ:
+ case STABLE_THEN_READ_REQ:
+ {
+ priority_bits = 1000000000000000L;
+ txnId = txnId0;
+ break;
+ }
+ case COMMIT_REQ:
+ {
+ Commit commit = (Commit) context;
+ if (PRIORITY_MODEL == PHASE_HLC_FIFO || commit.ballot.equals(Ballot.ZERO))
+ txnId = commit.txnId;
+ if (commit.kind.saveStatus == SaveStatus.Stable) priority_bits = 1000000000000L;
+ else priority_bits = 2000000000000L;
+ break;
+ }
+ case ACCEPT_REQ:
+ {
+ Accept accept = (Accept) context;
+ if (PRIORITY_MODEL == PHASE_HLC_FIFO || accept.ballot.equals(Ballot.ZERO))
+ txnId = accept.txnId;
+ priority_bits = 3000000000000L;
+ break;
+ }
+ case GET_EPHEMERAL_READ_DEPS_REQ:
+ case PRE_ACCEPT_REQ:
+ {
+ txnId = txnId0;
+ break;
+ }
+ }
+ }
+ }
+ break;
+ }
+ case HLC_FIFO:
+ {
+ txnId = task.preLoadContext().primaryTxnId();
+ break;
+ }
+ case FIFO:
+ {
+ break;
+ }
+ }
+
+ if (txnId != null)
+ {
+ long hlc = txnId.hlc();
+ long delta = nextPosition - hlc;
+ if (delta < AGE_TO_FIFO)
+ {
+ long position = hlc;
+ if (delta <= 0) nextPosition = position + 1;
+ else if (position < minPosition) position = minPosition;
+ position |= priority_bits;
+ task.queuePosition = position;
+ return;
+ }
+ }
+
+ assignFifoQueuePosition(task);
+ }
+
}
- private void assignNewQueuePosition(Task task)
+ private void assignMinQueuePosition(Task task)
{
- if (nextPosition == 0) nextPosition++;
- task.queuePosition = nextPosition++;
+ task.queuePosition = minPosition | PRIORITY_BITS;
+ }
+
+ private void assignFifoQueuePosition(Task task)
+ {
+ task.queuePosition = nextPosition++ | PRIORITY_BITS;
+ }
+
+ private void updateNextPosition(Task task)
+ {
+ nextPosition = Math.max(nextPosition, (task.queuePosition & ~PRIORITY_BITS) + 1);
}
private void inheritQueuePosition(Task parent, Task task)
@@ -698,18 +860,19 @@ void completeTaskExclusive(Task task)
// - first take the position so that represents the just-executed task
// - call cleanup to submit any following task on the relevant sub-queue
// - remove the previous task from the running collection only if still present (SequentialExecutor will have removed it)
- int position = task.queuePosition;
+ long position = task.queuePosition;
try
{
- task.cleanupExclusive();
+ task.cleanupExclusive(this);
}
finally
{
+ if (DEBUG_EXECUTION) task.debug.onCompleted(task, debug);
--tasks;
if (running.contains(task))
running.remove(task);
- if (waitingForCompletion != null && waitingForCompletion.peek().maybeNotify - position >= 0)
+ if (waitingForCompletion != null && waitingForCompletion.peek().maybeNotify <= position)
maybeNotifyWaitingForCompletion();
cache.tryShrinkOrEvict(lock);
@@ -718,12 +881,12 @@ void completeTaskExclusive(Task task)
private void maybeNotifyWaitingForCompletion()
{
- int min = minPosition(waitingToRun.peek(),
+ long min = minPosition(waitingToRun.peek(),
minPosition(waitingToLoad.peek(),
minPosition(waitingToLoadRangeTxns.peek(),
minPosition(running.peek(),
minPosition(loading.peek(),
- minPosition(scanningRanges.peek(), Integer.MAX_VALUE))))));
+ minPosition(scanningRanges.peek(), Long.MAX_VALUE))))));
while (!waitingForCompletion.isEmpty() && waitingForCompletion.peek().position - min <= 0)
waitingForCompletion.poll().run.run();
@@ -733,9 +896,9 @@ private void maybeNotifyWaitingForCompletion()
waitingForCompletion.peek().maybeNotify = min;
}
- private static int minPosition(@Nullable Task task, int min)
+ private static long minPosition(@Nullable Task task, long min)
{
- return task == null ? min : Integer.min(task.queuePosition, min);
+ return task == null ? min : Long.min(task.queuePosition, min);
}
void cancelExclusive(AccordTask> task)
@@ -746,7 +909,7 @@ void cancelExclusive(AccordTask> task)
case INITIALIZED:
// we could be cancelled before we even reach the queue
try { task.cancelExclusive(); }
- finally { task.cleanupExclusive(); }
+ finally { task.cleanupExclusive(this); }
break;
case SCANNING_RANGES:
@@ -869,7 +1032,7 @@ public void executeDirectlyWithLock(Runnable command)
}
finally
{
- beforeUnlock();
+ beforeUnlockExternal();
unlock();
}
}
@@ -947,24 +1110,31 @@ void clearRunning()
public static abstract class Task extends IntrusivePriorityHeap.Node
{
- int queuePosition;
- Thread assigned;
+ long queuePosition;
+ public long createdAt = nanoTime(), waitingToRunAt, runningAt, cleanupAt;
+ public final DebugTask debug = DebugTask.maybeDebug();
protected Task()
{
}
+ public void onWaitingToRun()
+ {
+ waitingToRunAt = nanoTime();
+ }
+
public DebuggableTask debuggable() { return null; }
/**
* Prepare to run while holding the state cache lock
*/
- abstract protected void preRunExclusive();
+ abstract protected void preRunExclusive(Thread assigned);
/**
* Run the command; the state cache lock may or may not be held depending on the executor implementation
*/
- abstract protected void runInternal();
+ protected abstract void runInternal();
+
/**
* Fail the command; the state cache lock may or may not be held depending on the executor implementation
*/
@@ -973,7 +1143,19 @@ protected Task()
/**
* Cleanup the command while holding the state cache lock
*/
- abstract protected void cleanupExclusive();
+ protected void cleanupExclusive(AccordExecutor executor)
+ {
+ cleanupAt = nanoTime();
+ if (runningAt != 0)
+ {
+ if (waitingToRunAt == 0)
+ waitingToRunAt = runningAt;
+ executor.elapsedWaitingToRun.increment(runningAt - waitingToRunAt, runningAt);
+ executor.elapsedPreparingToRun.increment(waitingToRunAt - createdAt, runningAt);
+ executor.elapsedRunning.increment(cleanupAt - runningAt, cleanupAt);
+ executor.elapsed.increment(cleanupAt - createdAt, cleanupAt);
+ }
+ }
void cancelExclusive(AccordExecutor owner) {}
@@ -1011,9 +1193,9 @@ static class SequentialQueueTask extends Task
}
@Override
- protected void preRunExclusive()
+ protected void preRunExclusive(Thread assigned)
{
- queue.preRunTask();
+ queue.preRunTask(assigned);
}
@Override
@@ -1029,7 +1211,7 @@ protected void fail(Throwable t)
}
@Override
- protected void cleanupExclusive()
+ protected void cleanupExclusive(AccordExecutor executor)
{
queue.cleanupTask();
}
@@ -1054,51 +1236,54 @@ public class SequentialExecutor extends TaskQueue implements SequentialAsy
private volatile boolean visibleStopped;
private boolean terminated;
- SequentialExecutor()
+ final DebugSequentialExecutor debug;
+
+ SequentialExecutor(AccordExecutor executor)
{
- this(-1);
+ this(executor, -1);
}
- SequentialExecutor(int commandStoreId)
+ SequentialExecutor(AccordExecutor executor, int commandStoreId)
{
- super(WAITING_TO_RUN);
+ super(WAITING_TO_RUN, commandStoreId < 0);
this.commandStoreId = commandStoreId;
this.selfTask = new SequentialQueueTask(this);
+ this.debug = DebugSequentialExecutor.maybeDebug(executor.debug, commandStoreId);
}
- void preRunTask()
+ void preRunTask(Thread assigned)
{
Invariants.require(task != null);
- assigned = Thread.currentThread();
- task.preRunExclusive();
+ this.assigned = assigned;
+ task.preRunExclusive(assigned);
}
void runTask()
{
- outer: while (!ownerUpdater.compareAndSet(this, null, assigned))
+ if (!ownerUpdater.compareAndSet(this, null, assigned))
{
+ if (DEBUG_EXECUTION) debug.onWaiting();
+ Invariants.require(assigned == Thread.currentThread());
waiting = assigned;
- while (true)
+ outer: do
{
- Thread owner = this.owner;
- if (owner == assigned) break outer;
- if (owner == null) continue outer;
- LockSupport.park();
+ while (true)
+ {
+ Thread owner = this.owner;
+ if (owner == assigned) break outer;
+ if (owner == null) continue outer;
+ LockSupport.park();
+ }
}
+ while (!ownerUpdater.compareAndSet(this, null, assigned));
}
waiting = null;
- try
- {
- if (stopped && reject(task))
- task.fail(new RejectedExecutionException(commandStoreId + " is terminated. Cannot execute " + ((AccordTask>) task).preLoadContext()));
- else
- task.runInternal();
- }
- finally
- {
- owner = null;
- }
+ if (stopped && reject(task))
+ task.fail(new RejectedExecutionException(commandStoreId + " is terminated. Cannot execute " + ((AccordTask>) task).preLoadContext()));
+ else
+ task.runInternal();
+ // NOTE: cannot safely release owner here, in case an immediate-execution runs before we can release our references and store their changes to the cache
}
private boolean reject(Task task)
@@ -1118,11 +1303,13 @@ void failTask(Throwable t)
void cleanupTask()
{
- try { task.cleanupExclusive(); }
+ try { task.cleanupExclusive(AccordExecutor.this); }
finally
{
assigned = null;
+ owner = null;
task = super.poll();
+ if (DEBUG_EXECUTION) debug.onSetTask(task);
// it should only be possible for this method to be invoked once we're on the running queue
AccordExecutor.this.running.remove(selfTask);
@@ -1149,6 +1336,7 @@ protected void append(Task newTask)
task = newTask;
selfTask.queuePosition = newTask.queuePosition;
waitingToRun.append(selfTask);
+ if (DEBUG_EXECUTION) debug.onSetTask(newTask);
}
}
@@ -1176,6 +1364,7 @@ private boolean removeCurrentTask(Node remove)
// but can for other tasks that don't track their own state
task = super.poll();
+ if (DEBUG_EXECUTION) debug.onSetTask(task);
if (waitingToRun.contains(selfTask))
{
if (task == null) waitingToRun.remove(selfTask);
@@ -1242,7 +1431,7 @@ protected boolean contains(Task contains)
@Override
public AsyncChain chain(Runnable run)
{
- int position = inheritQueuePosition();
+ long position = inheritQueuePosition();
return new AsyncChains.Head<>()
{
@Override
@@ -1256,7 +1445,7 @@ protected Cancellable start(BiConsumer super Void, Throwable> callback)
@Override
public AsyncChain chain(Callable call)
{
- int position = inheritQueuePosition();
+ long position = inheritQueuePosition();
return new AsyncChains.Head<>()
{
@Override
@@ -1270,7 +1459,7 @@ protected Cancellable start(BiConsumer super T, Throwable> callback)
@Override
public AsyncChain flatChain(Callable extends AsyncChain> call)
{
- int position = inheritQueuePosition();
+ long position = inheritQueuePosition();
return new AsyncChains.Head<>()
{
@Override
@@ -1287,12 +1476,12 @@ public Cancellable execute(RunOrFail runOrFail)
return execute(runOrFail, inheritQueuePosition());
}
- private int inheritQueuePosition()
+ private long inheritQueuePosition()
{
return inExecutor() && task != null ? task.queuePosition : 0;
}
- private Cancellable execute(RunOrFail runOrFail, int queuePosition)
+ private Cancellable execute(RunOrFail runOrFail, long queuePosition)
{
PlainChain submit = new PlainChain(runOrFail, SequentialExecutor.this, queuePosition);
return AccordExecutor.this.submit(submit);
@@ -1319,12 +1508,13 @@ public boolean tryExecuteImmediately(Runnable run)
{
if (owner == null)
{
- this.owner = null;
+ Thread waiting = this.waiting;
+ Invariants.require(waiting != self);
+ this.owner = waiting;
+ if (waiting == null) // recheck, to ensure happens-before relation with a new waiter that expects any non-null owner to notify it
+ waiting = this.waiting;
if (waiting != null)
- {
LockSupport.unpark(waiting);
- ownerUpdater.compareAndSet(this, null, waiting);
- }
}
}
return true;
@@ -1340,6 +1530,12 @@ static class TaskQueue extends IntrusivePriorityHeap
this.kind = kind;
}
+ TaskQueue(AccordTask.State kind, boolean tiny)
+ {
+ super(tiny);
+ this.kind = kind;
+ }
+
@Override
public int compare(T o1, T o2)
{
@@ -1421,10 +1617,7 @@ abstract class Plain extends SubmittableTask implements Cancellable
abstract SequentialExecutor executor();
@Override
- protected void preRunExclusive() {}
-
- @Override
- protected void cleanupExclusive() {}
+ protected void preRunExclusive(Thread assigned) {}
@Override
protected final void addToQueue(TaskQueue queue)
@@ -1475,7 +1668,7 @@ class PlainRunnable extends Plain implements Cancellable
this(result, run, null, 0);
}
- PlainRunnable(AsyncPromise result, Runnable run, @Nullable SequentialExecutor executor, int queuePosition)
+ PlainRunnable(AsyncPromise result, Runnable run, @Nullable SequentialExecutor executor, long queuePosition)
{
this.result = result;
this.run = run;
@@ -1486,12 +1679,14 @@ class PlainRunnable extends Plain implements Cancellable
@Override
protected void runInternal()
{
+ runningAt = nanoTime();
try (Closeable close = locals.get())
{
run.run();
}
if (result != null)
result.trySuccess(null);
+ if (DEBUG_EXECUTION) debug.onRunComplete();
}
@Override
@@ -1518,14 +1713,15 @@ abstract class IOTask extends Plain implements Cancellable, DebuggableTask
abstract void postRunExclusive();
@Override
- protected void preRunExclusive()
+ protected void preRunExclusive(Thread assigned)
{
startedAtNanos = MonotonicClock.Global.approxTime.now();
}
@Override
- protected void cleanupExclusive()
+ protected void cleanupExclusive(AccordExecutor executor)
{
+ super.cleanupExclusive(executor);
postRunExclusive();
}
@@ -1586,10 +1782,12 @@ void postRunExclusive()
@Override
public void runInternal()
{
+ runningAt = nanoTime();
try (Closeable close = locals.get())
{
result = entry.owner.parent().adapter().load(entry.owner.commandStore, entry.key());
}
+ if (DEBUG_EXECUTION) debug.onRunComplete();
}
@Override
@@ -1631,10 +1829,12 @@ class WrappedIOTask extends IOTask
@Override
protected void runInternal()
{
+ runningAt = nanoTime();
try (Closeable close = locals.get())
{
wrapped.runInternal();
}
+ if (DEBUG_EXECUTION) debug.onRunComplete();
}
@Override
@@ -1680,10 +1880,12 @@ void postRunExclusive()
@Override
public void runInternal()
{
+ runningAt = nanoTime();
try (Closeable close = locals.get())
{
run.run();
}
+ if (DEBUG_EXECUTION) debug.onRunComplete();
failure = null;
}
@@ -1710,7 +1912,7 @@ class PlainChain extends Plain
this(runOrFail, null, 0);
}
- PlainChain(RunOrFail runOrFail, @Nullable SequentialExecutor executor, int queuePosition)
+ PlainChain(RunOrFail runOrFail, @Nullable SequentialExecutor executor, long queuePosition)
{
this.runOrFail = runOrFail;
this.executor = executor;
@@ -1726,6 +1928,7 @@ SequentialExecutor executor()
@Override
protected void runInternal()
{
+ runningAt = nanoTime();
try (Closeable close = locals.get())
{
runOrFail.run();
@@ -1735,6 +1938,7 @@ protected void runInternal()
// shouldn't throw exceptions
agent.onException(t);
}
+ if (DEBUG_EXECUTION) debug.onRunComplete();
}
@Override
@@ -1778,16 +1982,11 @@ public long startTimeNanos()
}
@Override
- protected void preRunExclusive()
+ protected void preRunExclusive(Thread assigned)
{
startedAtNanos = MonotonicClock.Global.approxTime.now();
}
- @Override
- protected void cleanupExclusive()
- {
- }
-
@Override
public String description()
{
@@ -1829,7 +2028,7 @@ public Integer commandStoreId()
return commandStoreId >= 0 ? commandStoreId : null;
}
- public int position()
+ public long position()
{
return task.queuePosition;
}
@@ -1858,7 +2057,7 @@ public int position()
public int compareTo(TaskInfo that)
{
int c = this.status.compareTo(that.status);
- if (c == 0) c = Integer.compare(this.position(), that.position());
+ if (c == 0) c = Long.compare(this.position(), that.position());
return c;
}
}
@@ -1918,4 +2117,5 @@ public int unsafeRunningCount()
{
return running.size();
}
+
}
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java
index 768ddebc7578..7345b7de2bb9 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractLockLoop.java
@@ -26,15 +26,18 @@
import accord.utils.QuintConsumer;
import org.apache.cassandra.concurrent.DebuggableTask.DebuggableTaskRunner;
+import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.service.accord.AccordExecutorLoops.LoopTask;
+import org.apache.cassandra.service.accord.debug.DebugExecution.DebugExecutorLoop;
import org.apache.cassandra.utils.concurrent.ConcurrentLinkedStack;
import static org.apache.cassandra.service.accord.AccordExecutor.Mode.RUN_WITH_LOCK;
+import static org.apache.cassandra.service.accord.debug.DebugExecution.DEBUG_EXECUTION;
abstract class AccordExecutorAbstractLockLoop extends AccordExecutor
{
+ private static final int YIELD_INTERVAL = DatabaseDescriptor.getAccord().queue_yield_interval;
final ConcurrentLinkedStack submitted = new ConcurrentLinkedStack<>();
- boolean isHeldByExecutor;
boolean shutdown;
AccordExecutorAbstractLockLoop(Lock lock, int executorId, Agent agent)
@@ -44,9 +47,9 @@ abstract class AccordExecutorAbstractLockLoop extends AccordExecutor
abstract void notifyWork();
abstract void notifyWorkExclusive();
+ void loopYieldExclusive() throws InterruptedException {}
abstract void awaitExclusive() throws InterruptedException;
abstract AccordExecutorLoops loops();
- abstract boolean isInLoop();
abstract void submitExternal(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4);
void submit(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4)
@@ -57,7 +60,7 @@ void submit(QuintConsumer void submitExternalExclusive(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4)
+ void submitExternalExclusive(QuintConsumer sync, P1s p1s, P2 p2, P3 p3, P4 p4)
{
try
{
@@ -95,50 +98,54 @@ public boolean hasTasks()
}
}
- @Override
- void beforeUnlock()
- {
- if (!isInLoop())
- notifyIfMoreWorkExclusive();
- }
-
- void updateWaitingToRunExclusive()
+ final void updateWaitingToRunExclusive()
{
drainSubmittedExclusive();
super.updateWaitingToRunExclusive();
}
- void drainSubmittedExclusive()
+ final void drainSubmittedExclusive()
{
submitted.drain(AccordExecutor::consumeExclusive, this, true);
}
- void notifyIfMoreWorkExclusive()
+ final void notifyIfMoreWorkExclusive()
{
if (hasWaitingToRun())
notifyWorkExclusive();
}
- private void enterLockExclusive()
+ @Override
+ final void beforeUnlockExternal()
+ {
+ beforeUnlockLoop();
+ }
+
+ final void beforeUnlockLoop()
{
- isHeldByExecutor = true;
+ notifyIfMoreWorkExclusive();
}
- private void exitLockExclusive()
+ private void enterLockLoop()
{
+ resumeLoop();
+ }
+
+ private void exitLockLoop()
+ {
+ pauseLoop();
notifyIfMoreWorkExclusive();
}
- private void pauseExclusive()
+ final void pauseLoop()
{
- isHeldByExecutor = false;
if (--runningThreads == 0 && tasks == 0)
notifyQuiescentExclusive();
}
- private void resumeExclusive()
+ final void resumeLoop()
{
- isHeldByExecutor = true;
+ if (DEBUG_EXECUTION) debug.onEnterLock();
++runningThreads;
}
@@ -154,14 +161,14 @@ protected LoopTask runWithLock(String name)
@Override
public void run()
{
+ Thread self = Thread.currentThread();
Task task;
while (true)
{
lock();
try
{
- resumeExclusive();
- enterLockExclusive();
+ enterLockLoop();
while (true)
{
task = pollWaitingToRunExclusive();
@@ -171,7 +178,7 @@ public void run()
setRunning(task);
try
{
- task.preRunExclusive();
+ task.preRunExclusive(self);
task.runInternal();
}
catch (Throwable t)
@@ -188,22 +195,21 @@ public void run()
{
if (shutdown)
{
- pauseExclusive();
- exitLockExclusive();
+ pauseLoop();
+ exitLockLoop();
notifyWorkExclusive(); // always notify on shutdown
return;
}
- pauseExclusive();
+ pauseLoop();
awaitExclusive();
- resumeExclusive();
+ resumeLoop();
}
}
}
catch (Throwable t)
{
- pauseExclusive();
- exitLockExclusive();
+ exitLockLoop();
try { agent.onException(t); }
catch (Throwable t2) { }
@@ -221,15 +227,21 @@ protected LoopTask runWithoutLock(String name)
{
return new LoopTask(name)
{
+ final DebugExecutorLoop debug = DEBUG_EXECUTION ? new DebugExecutorLoop(AccordExecutorAbstractLockLoop.this.debug) : null;
@Override
public void run()
{
+ Thread self = Thread.currentThread();
+ int count = 0;
Task task = null;
while (true)
{
+ if (DEBUG_EXECUTION) debug.onLock();
lock();
try
{
+ if (DEBUG_EXECUTION) debug.onEnterLock();
+ enterLockLoop();
if (task != null)
{
Task tmp = task;
@@ -237,31 +249,40 @@ public void run()
completeTaskExclusive(tmp);
clearRunning();
}
- else resumeExclusive();
- enterLockExclusive();
+
+ if (count >= YIELD_INTERVAL)
+ {
+ loopYieldExclusive();
+ count = 0;
+ }
while (true)
{
task = pollWaitingToRunExclusive();
+
if (task != null)
{
setRunning(task);
- task.preRunExclusive();
- exitLockExclusive();
+ task.preRunExclusive(self);
+ if (DEBUG_EXECUTION) debug.onExitLock();
+ exitLockLoop();
break;
}
- pauseExclusive();
-
if (shutdown)
{
- exitLockExclusive();
+ if (DEBUG_EXECUTION) debug.onExitLock();
+ exitLockLoop();
notifyWorkExclusive();
return;
}
+ pauseLoop();
+ if (DEBUG_EXECUTION) debug.onExitLock();
awaitExclusive();
- resumeExclusive();
+ if (DEBUG_EXECUTION) debug.onEnterLock();
+ resumeLoop();
+ count = 0;
}
}
catch (Throwable t)
@@ -281,18 +302,18 @@ public void run()
try { agent.onException(t); }
catch (Throwable t2) { /* nothing we can sensibly do after already reporting */ }
}
- if (isHeldByExecutor)
- pauseExclusive();
- exitLockExclusive();
+ exitLockLoop();
continue;
}
finally
{
+ if (DEBUG_EXECUTION) debug.onExitLock();
unlock();
}
try
{
+ ++count;
task.runInternal();
}
catch (Throwable t)
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java
index ccba5f7085be..f10caab42f56 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorAbstractSemiSyncSubmit.java
@@ -35,20 +35,7 @@ abstract class AccordExecutorAbstractSemiSyncSubmit extends AccordExecutorAbstra
void submitExternal(QuintConsumer sync, QuadFunction async, P1s p1s, P1a p1a, P2 p2, P3 p3, P4 p4)
{
- if (!tryLock())
- {
- submitted.push(async.apply(p1a, p2, p3, p4));
+ if (submitted.push(async.apply(p1a, p2, p3, p4)) && !isInLoop())
notifyWork();
- return;
- }
-
- try
- {
- submitExternalExclusive(sync, async, p1s, p1a, p2, p3, p4);
- }
- finally
- {
- unlock();
- }
}
}
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java
index 3ccbb5933a1a..f3e694c1c97b 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSemiSyncSubmit.java
@@ -31,6 +31,7 @@ class AccordExecutorSemiSyncSubmit extends AccordExecutorAbstractSemiSyncSubmit
private final AccordExecutorLoops loops;
private final ReentrantLock lock;
private final Condition hasWork;
+ private int waiting;
public AccordExecutorSemiSyncSubmit(int executorId, Mode mode, int threads, IntFunction name, Agent agent)
{
@@ -45,11 +46,30 @@ private AccordExecutorSemiSyncSubmit(ReentrantLock lock, int executorId, Mode mo
this.loops = new AccordExecutorLoops(mode, threads, name, this::task);
}
+ @Override
+ void loopYieldExclusive() throws InterruptedException
+ {
+ if (waiting > 0 && hasWaitingToRun())
+ {
+ pauseLoop();
+ hasWork.signal();
+ awaitWork();
+ resumeLoop();
+ }
+ }
+
@Override
void awaitExclusive() throws InterruptedException
{
if (submitted.isEmpty())
- hasWork.await();
+ awaitWork();
+ }
+
+ private void awaitWork() throws InterruptedException
+ {
+ waiting++;
+ try { hasWork.await(); }
+ finally { waiting--; }
}
@Override
@@ -67,19 +87,9 @@ boolean isInLoop()
@Override
void notifyWork()
{
- // we check running both sides of tryLock for ordering guarantees
- boolean hadRunning = isHeldByExecutor;
- if (lock.tryLock())
- {
- try { hasWork.signal(); }
- finally { lock.unlock(); }
- }
- else if (!hadRunning || !isHeldByExecutor)
- {
- lock.lock();
- try { hasWork.signal(); }
- finally { lock.unlock(); }
- }
+ lock.lock();
+ try { hasWork.signal(); }
+ finally { lock.unlock(); }
}
@Override
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java
index c7842e4666d5..349c9be62052 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSimple.java
@@ -62,6 +62,12 @@ public AccordExecutorSimple(ReentrantLock lock, int executorId, Mode mode, int t
this.executor = executorFactory().sequential(name.apply(0));
}
+ @Override
+ boolean isInLoop()
+ {
+ return executor.inExecutor();
+ }
+
@Override
public boolean hasTasks()
{
@@ -69,7 +75,7 @@ public boolean hasTasks()
}
@Override
- void beforeUnlock()
+ void beforeUnlockExternal()
{
if (hasWaitingToRun())
executor.execute(this::run);
@@ -77,6 +83,7 @@ void beforeUnlock()
protected void run()
{
+ Thread self = Thread.currentThread();
lock.lock();
try
{
@@ -92,7 +99,7 @@ protected void run()
return;
}
- try { task.preRunExclusive(); task.runInternal(); }
+ try { task.preRunExclusive(self); task.runInternal(); }
catch (Throwable t) { task.fail(t); }
finally
{
diff --git a/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java b/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java
index 8243e81e523e..6547048f67d0 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordExecutorSyncSubmit.java
@@ -79,14 +79,8 @@ boolean isOwningThread()
void notifyWork()
{
lock.lock();
- try
- {
- hasWork.signal();
- }
- finally
- {
- lock.unlock();
- }
+ try { hasWork.signal(); }
+ finally { lock.unlock(); }
}
@Override
@@ -100,7 +94,7 @@ void submitExternal(QuintConsumer mapping = builder.build();
StandardMessage.initialise(mapping);
@@ -209,6 +214,7 @@ public void send(Node.Id to, Request request)
{
Verb verb = VerbMapping.getVerb(request);
Preconditions.checkNotNull(verb, "Verb is null for type %s", request.type());
+
Message message = Message.out(verb, request);
InetAddressAndPort endpoint = endpointMapper.mappedEndpointOrNull(to, message);
if (endpoint == null)
@@ -224,7 +230,7 @@ public Cancellable send(Node.Id to, Request request, int attempt, AsyncExecutor
Verb verb = VerbMapping.getVerb(request);
Preconditions.checkNotNull(verb, "Verb is null for type %s", request.type());
- long nowNanos = Clock.Global.nanoTime();
+ long nowNanos = nanoTime();
TxnId txnId = request.primaryTxnId();
long slowAtNanos = Long.MAX_VALUE;
long expiresAtNanos = nowNanos + expire(txnId, verb).computeWait(attempt, NANOSECONDS);
@@ -251,6 +257,13 @@ public Cancellable send(Node.Id to, Request request, int attempt, AsyncExecutor
}
Message message = Message.out(verb, request, expiresAtNanos);
+ if (request instanceof MapReduceCommandStores, ?>)
+ {
+ Tracing tracing = ((MapReduceCommandStores, ?>) request).tracing();
+ if (tracing != null) tracing = tracing.send();
+ if (tracing != null) message = message.withParam(ParamType.ACCORD_TRACING, tracing);
+ }
+
InetAddressAndPort endpoint = endpointMapper.mappedEndpointOrNull(to, message);
if (endpoint == null)
{
@@ -263,26 +276,24 @@ public Cancellable send(Node.Id to, Request request, int attempt, AsyncExecutor
return cancellable;
}
- @Override
- public void reply(Node.Id replyingTo, ReplyContext replyContext, Reply reply)
+ public void reply(Node.Id replyingTo, ReplyContext replyContext, Reply reply, Throwable failure)
{
ResponseContext respondTo = (ResponseContext) replyContext;
- Message> message = Message.responseWith(reply, respondTo);
- if (!reply.isFinal())
- message = message.withFlag(MessageFlag.NOT_FINAL);
- checkReplyType(reply, respondTo);
- InetAddressAndPort endpoint = endpointMapper.mappedEndpointOrNull(replyingTo, message);
- if (endpoint == null)
- return;
-
- messaging.send(message, endpoint);
- }
+ Message> message;
+ if (failure != null) message = Message.failureResponse(RequestFailureReason.UNKNOWN, failure, respondTo);
+ else
+ {
+ message = Message.responseWith(reply, respondTo);
+ if (Invariants.isParanoid()) checkReplyType(reply, respondTo);
+ }
+ Object tracing = respondTo.params().get(ParamType.ACCORD_TRACING);
+ if (tracing != null)
+ {
+ tracing = ((Tracing)tracing).send();
+ if (tracing != null)
+ message = message.withParam(ParamType.ACCORD_TRACING, tracing);
+ }
- @Override
- public void replyWithUnknownFailure(Node.Id replyingTo, ReplyContext replyContext, Throwable failure)
- {
- ResponseContext respondTo = (ResponseContext) replyContext;
- Message> message = Message.failureResponse(RequestFailureReason.UNKNOWN, failure, respondTo);
InetAddressAndPort endpoint = endpointMapper.mappedEndpointOrNull(replyingTo, message);
if (endpoint == null)
return;
diff --git a/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java b/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java
index 8e282873d176..8f6f374986d8 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordObjectSizes.java
@@ -25,7 +25,7 @@
import accord.api.Result;
import accord.api.RoutingKey;
import accord.local.Command;
-import accord.local.ICommand;
+import accord.local.CommandBuilder;
import accord.local.Node;
import accord.local.StoreParticipants;
import accord.local.cfk.CommandsForKey;
@@ -39,11 +39,9 @@
import accord.primitives.FullRangeRoute;
import accord.primitives.KeyDeps;
import accord.primitives.Keys;
-import accord.primitives.PartialDeps;
import accord.primitives.PartialKeyRoute;
import accord.primitives.PartialRangeRoute;
import accord.primitives.PartialTxn;
-import accord.primitives.Participants;
import accord.primitives.Range;
import accord.primitives.RangeDeps;
import accord.primitives.Ranges;
@@ -61,6 +59,7 @@
import accord.utils.UnhandledEnum;
import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
import org.apache.cassandra.dht.Murmur3Partitioner;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.service.accord.api.PartitionKey;
@@ -73,18 +72,15 @@
import org.apache.cassandra.service.accord.txn.TxnRead;
import org.apache.cassandra.service.accord.txn.TxnResult;
import org.apache.cassandra.service.accord.txn.TxnWrite;
+import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.ObjectSizes;
-import static accord.local.Command.Accepted.accepted;
-import static accord.local.Command.Committed.committed;
-import static accord.local.Command.Executed.executed;
-import static accord.local.Command.NotAcceptedWithoutDefinition.notAccepted;
-import static accord.local.Command.NotDefined.notDefined;
-import static accord.local.Command.PreAccepted.preaccepted;
import static accord.local.Command.Truncated.WaitingOn;
-import static accord.local.Command.Truncated.invalidated;
-import static accord.local.Command.Truncated.vestigial;
import static accord.local.cfk.CommandsForKey.InternalStatus.ACCEPTED;
+import static accord.primitives.SaveStatus.Invalidated;
+import static accord.primitives.SaveStatus.NotDefined;
+import static accord.primitives.SaveStatus.PreAccepted;
+import static accord.primitives.SaveStatus.TruncatedUnapplied;
import static accord.primitives.Status.Durability.NotDurable;
import static accord.primitives.TxnId.NO_TXNIDS;
import static org.apache.cassandra.utils.ObjectSizes.measure;
@@ -306,23 +302,24 @@ public static long results(Result result)
private static class CommandEmptySizes
{
- private final static TokenKey EMPTY_KEY = new TokenKey(EMPTY_ID, null);
+ private final static PartitionKey EMPTY_KEY = new PartitionKey(EMPTY_ID, new BufferDecoratedKey(new Murmur3Partitioner.LongToken(1), ByteBufferUtil.EMPTY_BYTE_BUFFER));
+ private final static TokenKey EMPTY_TOKEN_KEY = new TokenKey(EMPTY_ID, new Murmur3Partitioner.LongToken(1));
private final static TxnId EMPTY_TXNID = new TxnId(42, 42, 0, Kind.Read, Domain.Key, new Node.Id(42));
- private static ICommand attrs(boolean hasDeps, boolean hasTxn, boolean executes)
+ private static Command build(SaveStatus saveStatus, boolean hasDeps, boolean hasTxn, boolean executes)
{
- FullKeyRoute route = new FullKeyRoute(EMPTY_KEY, new RoutingKey[]{ EMPTY_KEY });
- Participants> empty = route.slice(0, 0);
- ICommand.Builder builder = new ICommand.Builder(EMPTY_TXNID)
- .setParticipants(StoreParticipants.create(route, empty, executes ? empty : null, executes ? empty : null, empty, route))
+ Keys keys = Keys.of(EMPTY_KEY);
+ FullKeyRoute route = new FullKeyRoute(EMPTY_TOKEN_KEY, new RoutingKey[]{ EMPTY_TOKEN_KEY });
+ CommandBuilder builder = new CommandBuilder(EMPTY_TXNID)
+ .participants(StoreParticipants.create(route, route, executes ? route : null, executes ? route : null, route, route))
.durability(NotDurable)
.executeAt(EMPTY_TXNID)
.promised(Ballot.ZERO);
if (hasDeps)
- builder.partialDeps(PartialDeps.NONE);
+ builder.partialDeps(new Deps(KeyDeps.none(route.toParticipants()), RangeDeps.NONE).intersecting(route));
if (hasTxn)
- builder.partialTxn(new PartialTxn.InMemory(Kind.Read, null, null, null, null, TableMetadatasAndKeys.none(Domain.Key)));
+ builder.partialTxn(new PartialTxn.InMemory(Kind.Read, keys, TxnRead.empty(Domain.Key), null, null, TableMetadatasAndKeys.none(Domain.Key)));
if (executes)
{
@@ -330,18 +327,20 @@ private static ICommand attrs(boolean hasDeps, boolean hasTxn, boolean executes)
builder.result(new TxnData());
}
- return builder;
+ return builder.build(saveStatus);
}
- final static long NOT_DEFINED = measure(notDefined(attrs(false, false, false)));
- final static long PREACCEPTED = measure(preaccepted(attrs(false, true, false), SaveStatus.PreAccepted));
- final static long NOTACCEPTED = measure(notAccepted(attrs(false, false, false), SaveStatus.AcceptedInvalidate));
- final static long ACCEPTED = measure(accepted(attrs(true, false, false), SaveStatus.AcceptedMedium));
- final static long COMMITTED = measure(committed(attrs(true, true, false), SaveStatus.Committed));
- final static long EXECUTED = measure(executed(attrs(true, true, true), SaveStatus.Applied));
+ final static long NOT_DEFINED = measure(build(NotDefined, false, false, false));
+ final static long PREACCEPTED = measure(build(PreAccepted, false, true, false));
+ final static long NOTACCEPTED = measure(build(SaveStatus.AcceptedInvalidate, false, false, false));
+ final static long ACCEPTED = measure(build(SaveStatus.AcceptedMedium, true, false, false));
+ final static long COMMITTED = measure(build(SaveStatus.Committed, true, true, false));
+ final static long EXECUTED = measure(build(SaveStatus.Applied, true, true, true));
// TODO (expected): TruncatedAwaitsOnlyDeps
- final static long TRUNCATED = measure(vestigial(EMPTY_TXNID, attrs(false, false, false).participants()));
- final static long INVALIDATED = measure(invalidated(EMPTY_TXNID, attrs(false, false, false).participants()));
+ final static long TRUNCATED = measure(build(TruncatedUnapplied, false, false, false).participants());
+ final static long INVALIDATED = measure(build(Invalidated, false, false, false).participants());
+
+ private static void touch() {}
private static long emptySize(Command command)
{
diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java
index 8800ea3cf085..4626a8c6bb09 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandStore.java
@@ -81,11 +81,11 @@ public CommandStores.RangesForEpoch ranges()
return commandStore.unsafeGetRangesForEpoch();
}
- public static AccordSafeCommandStore create(AccordTask> operation,
+ public static AccordSafeCommandStore create(AccordTask> task,
@Nullable CommandSummaries commandsForRanges,
AccordCommandStore commandStore)
{
- return new AccordSafeCommandStore(operation, commandsForRanges, commandStore);
+ return new AccordSafeCommandStore(task, commandsForRanges, commandStore);
}
@VisibleForTesting
diff --git a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java
index 3d34d463dca4..74201823b10b 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordSafeCommandsForKey.java
@@ -24,10 +24,21 @@
import accord.api.RoutingKey;
import accord.local.cfk.CommandsForKey;
+import accord.local.cfk.NotifySink;
import accord.local.cfk.SafeCommandsForKey;
public class AccordSafeCommandsForKey extends SafeCommandsForKey implements AccordSafeState
{
+ public static class CommandsForKeyCacheEntry extends AccordCacheEntry
+ {
+ private NotifySink overrideSink;
+
+ CommandsForKeyCacheEntry(RoutingKey key, AccordCache.Type.Instance owner)
+ {
+ super(key, owner);
+ }
+ }
+
private boolean invalidated;
private final AccordCacheEntry global;
private CommandsForKey original;
@@ -39,6 +50,8 @@ public AccordSafeCommandsForKey(AccordCacheEntry glo
this.global = global;
this.original = null;
this.current = null;
+// if (overrideSink() == null)
+// overrideSink(new RecordingNotifySink());
}
@Override
@@ -103,6 +116,18 @@ public void set(CommandsForKey cfk)
this.current = cfk;
}
+ @Override
+ public void overrideSink(NotifySink overrideSink)
+ {
+ ((CommandsForKeyCacheEntry)global).overrideSink = overrideSink;
+ }
+
+ @Override
+ public NotifySink overrideSink()
+ {
+ return ((CommandsForKeyCacheEntry)global).overrideSink;
+ }
+
public CommandsForKey original()
{
checkNotInvalidated();
diff --git a/src/java/org/apache/cassandra/service/accord/AccordService.java b/src/java/org/apache/cassandra/service/accord/AccordService.java
index 2f0d575b31c5..60399ee27c7d 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordService.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordService.java
@@ -32,6 +32,7 @@
import java.util.function.BiConsumer;
import java.util.function.BiFunction;
import java.util.function.Function;
+import java.util.stream.Stream;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
@@ -72,6 +73,8 @@
import accord.primitives.Timestamp;
import accord.primitives.Txn;
import accord.primitives.TxnId;
+import accord.primitives.TxnId.FastPath;
+import accord.primitives.TxnId.FastPaths;
import accord.topology.ActiveEpochs;
import accord.topology.EpochReady;
import accord.topology.Shard;
@@ -87,9 +90,9 @@
import accord.utils.async.AsyncResults;
import org.apache.cassandra.concurrent.Shutdownable;
-import org.apache.cassandra.config.AccordSpec;
-import org.apache.cassandra.config.AccordSpec.CatchupMode;
-import org.apache.cassandra.config.AccordSpec.JournalSpec.ReplaySavePoint;
+import org.apache.cassandra.config.AccordConfig;
+import org.apache.cassandra.config.AccordConfig.CatchupMode;
+import org.apache.cassandra.config.AccordConfig.JournalConfig.ReplaySavePoint;
import org.apache.cassandra.config.CassandraRelevantProperties;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
@@ -158,23 +161,24 @@
import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException;
import static accord.api.Journal.TopologyUpdate;
-import static accord.api.ProtocolModifiers.Toggles.FastExec.MAY_BYPASS_SAFESTORE;
+import static accord.api.ProtocolModifiers.FastExecution.MAY_BYPASS_SAFESTORE;
import static accord.impl.progresslog.DefaultProgressLog.ModeFlag.CATCH_UP;
import static accord.local.durability.DurabilityService.SyncLocal.Self;
import static accord.local.durability.DurabilityService.SyncRemote.All;
import static accord.messages.SimpleReply.Ok;
import static accord.primitives.Txn.Kind.ExclusiveSyncPoint;
import static accord.primitives.Txn.Kind.Write;
+import static accord.primitives.TxnId.MediumPath.NoMediumPath;
+import static accord.primitives.TxnId.MediumPath.TrackStable;
import static java.util.concurrent.TimeUnit.MINUTES;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory;
import static org.apache.cassandra.concurrent.ExecutorFactory.SimulatorThreadTag.JOB;
import static org.apache.cassandra.concurrent.ExecutorFactory.SystemThreadTag.DAEMON;
-import static org.apache.cassandra.config.AccordSpec.CatchupMode.DISABLED;
-import static org.apache.cassandra.config.AccordSpec.CatchupMode.FALLBACK_TO_HARD;
-import static org.apache.cassandra.config.AccordSpec.CatchupMode.HARD;
-import static org.apache.cassandra.config.AccordSpec.JournalSpec.ReplayMode.RESET;
+import static org.apache.cassandra.config.AccordConfig.CatchupMode.FALLBACK_TO_HARD;
+import static org.apache.cassandra.config.AccordConfig.CatchupMode.HARD;
+import static org.apache.cassandra.config.AccordConfig.JournalConfig.ReplayMode.RESET;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccord;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordCommandStoreShardCount;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordGlobalDurabilityCycle;
@@ -276,12 +280,10 @@ public void unsafeResetForTesting(ClusterMetadata metadata)
private static final Future EPOCH_READY = ImmediateFuture.success(null);
static
{
- ProtocolModifiers.Toggles.setPermitLocalExecution(true);
- ProtocolModifiers.Toggles.setRequiresUniqueHlcs(true);
- ProtocolModifiers.Toggles.setFastReadExecMayResendTxn(true);
- ProtocolModifiers.Toggles.setFastReadExec(MAY_BYPASS_SAFESTORE);
- ProtocolModifiers.Toggles.setFastWriteExec(MAY_BYPASS_SAFESTORE);
- ProtocolModifiers.Toggles.setDataStoreDetectsFutureReads(true);
+ ProtocolModifiers.Configure.setDataStoreRequiresUniqueHlcs(true);
+ ProtocolModifiers.Configure.setDataStoreDetectsFutureReads(true);
+ ProtocolModifiers.Configure.setFastReadExecution(MAY_BYPASS_SAFESTORE);
+ ProtocolModifiers.Configure.setFastWriteExecution(MAY_BYPASS_SAFESTORE);
}
private enum State { INIT, STARTING, STARTED, STOPPED, SHUTTING_DOWN, SHUTDOWN }
@@ -322,6 +324,10 @@ public static void unsafeSetNoop()
unsafeInstance = instance = requestInstance = replyInstance = NOOP_SERVICE;
}
+ public static void touch()
+ {
+ }
+
public static IAccordService tryGetUnsafe()
{
return unsafeInstance;
@@ -448,14 +454,14 @@ public AccordService(Id localId)
AccordAgent agent = FBUtilities.construct(CassandraRelevantProperties.ACCORD_AGENT_CLASS.getString(AccordAgent.class.getName()), "AccordAgent");
agent.setup(localId);
AccordTimeService time = new AccordTimeService();
- final RequestCallbacks callbacks = new RequestCallbacks(time);
this.scheduler = new AccordScheduler();
+ final RequestCallbacks callbacks = new RequestCallbacks(time, scheduler);
this.dataStore = new AccordDataStore();
this.journal = new AccordJournal(DatabaseDescriptor.getAccord().journal);
this.endpointMapper = new EndpointMapping.Updateable();
+ this.messageSink = new AccordMessageSink(endpointMapper, callbacks);
this.topologyService = new AccordTopologyService(localId, endpointMapper);
this.fastPathCoordinator = AccordFastPathCoordinator.create(localId, endpointMapper);
- this.messageSink = new AccordMessageSink(endpointMapper, callbacks);
this.node = new Node(localId,
messageSink,
topologyService,
@@ -479,6 +485,34 @@ time, new AtomicUniqueTimeWithStaleReservation(time),
this.responseHandler = new AccordResponseVerbHandler<>(callbacks, endpointMapper);
}
+ public static void applyProtocolModifiers(AccordConfig config)
+ {
+ if (config.coordinator_backlog_execution != null)
+ ProtocolModifiers.Configure.setCoordinatorBacklogExecution(config.coordinator_backlog_execution);
+ if (config.permit_fast_path != null)
+ ProtocolModifiers.Configure.setPermittedFastPaths(new FastPaths(Stream.of(FastPath.values()).filter(fp -> fp.compareTo(config.permit_fast_path) <= 0).toArray(FastPath[]::new)));
+ if (config.permit_track_stable_medium_path != null)
+ ProtocolModifiers.Configure.setDefaultMediumPath(config.permit_track_stable_medium_path ? TrackStable : NoMediumPath);
+ if (config.permit_fast_quorum_medium_path != null)
+ ProtocolModifiers.Configure.setPermitFastQuorumMediumPath(config.permit_fast_quorum_medium_path);
+ if (config.always_inform_durable_single_key != null)
+ ProtocolModifiers.Configure.setInformOfSingleKeyDurabilityIfDepsSizeAtLeast(0);
+ if (config.replica_execution != null)
+ ProtocolModifiers.Configure.setReplicaExecution(config.replica_execution);
+ if (config.replica_execution_distributed_persist_chance != null)
+ ProtocolModifiers.Configure.setReplicaExecuteDistributedPersistChance(config.replica_execution_distributed_persist_chance);
+ if (config.fast_read_execution != null)
+ ProtocolModifiers.Configure.setFastReadExecution(config.fast_read_execution);
+ if (config.fast_write_execution != null)
+ ProtocolModifiers.Configure.setFastWriteExecution(config.fast_write_execution);
+ if (config.clean_cfk_before != null)
+ ProtocolModifiers.Configure.setCleanCfkBefore(config.clean_cfk_before);
+ if (config.send_stable != null)
+ ProtocolModifiers.Configure.setSendStableMessages(config.send_stable);
+ if (config.send_minimal != null)
+ ProtocolModifiers.Configure.setSendMinimalCommits(config.send_minimal);
+ }
+
@Override
public synchronized void localStartup()
{
@@ -497,6 +531,7 @@ public synchronized void localStartup()
case EXIT:
throw new RuntimeException("Stop marker is older than start marker (" + stopMarker + '<' + startMarker + ") , so cannot assume we have a complete log of our votes in any consensus groups. Exiting.");
+ case ALLOW_UNSAFE_STARTUP:
case UNSAFE_STARTUP:
logger.warn("Stop marker is older than start marker ({}<{}), so cannot assume we have a complete log of our votes in any consensus groups. Continuing to startup as configured.", stopMarker, startMarker);
break;
@@ -611,7 +646,11 @@ private Long2LongHashMap restoreFromSavePoints(CommandStores commandStores)
private void distributedStartupInternal()
{
finishTopologyInitialization();
- WatermarkCollector.fetchAndReportWatermarksAsync(topology());
+ WatermarkCollector.fetchAndReportWatermarksAsync(topology())
+ .addCallback((success, failure) -> {
+ topologyService.afterStartup(node);
+
+ });
fastPathCoordinator.start();
ClusterMetadataService.instance().log().addListener(fastPathCoordinator);
@@ -638,8 +677,8 @@ private void distributedStartupInternal()
void catchup()
{
- AccordSpec spec = DatabaseDescriptor.getAccord();
- if (spec.catchup_on_start == DISABLED)
+ AccordConfig spec = DatabaseDescriptor.getAccord();
+ if (spec.catchup_on_start == CatchupMode.DISABLED)
{
logger.info("Catchup disabled; continuing to startup");
return;
diff --git a/src/java/org/apache/cassandra/service/accord/AccordTask.java b/src/java/org/apache/cassandra/service/accord/AccordTask.java
index d8311ba8c3b7..380474f7332a 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordTask.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordTask.java
@@ -85,9 +85,8 @@
import static accord.local.LoadKeysFor.WRITE;
import static accord.primitives.Routable.Domain.Key;
import static accord.primitives.Txn.Kind.EphemeralRead;
-import static accord.utils.Invariants.illegalState;
-import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED;
import static org.apache.cassandra.config.DatabaseDescriptor.getPartitioner;
+import static org.apache.cassandra.service.accord.AccordTask.State.ASSIGNED;
import static org.apache.cassandra.service.accord.AccordTask.State.CANCELLED;
import static org.apache.cassandra.service.accord.AccordTask.State.FAILED;
import static org.apache.cassandra.service.accord.AccordTask.State.FINISHED;
@@ -99,13 +98,14 @@
import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_LOAD;
import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_RUN;
import static org.apache.cassandra.service.accord.AccordTask.State.WAITING_TO_SCAN_RANGES;
+import static org.apache.cassandra.service.accord.debug.DebugExecution.DEBUG_EXECUTION;
+import static org.apache.cassandra.service.accord.debug.DebugExecution.DebugTask.SANITY_CHECK;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
public abstract class AccordTask extends SubmittableTask implements Function, Cancellable, DebuggableTask
{
private static final Logger logger = LoggerFactory.getLogger(AccordTask.class);
private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
- private static final boolean SANITY_CHECK = DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED.getBoolean();
static class ForFunction extends AccordTask
{
@@ -161,7 +161,8 @@ public enum State
WAITING_TO_LOAD(INITIALIZED, SCANNING_RANGES),
LOADING(INITIALIZED, SCANNING_RANGES, WAITING_TO_LOAD),
WAITING_TO_RUN(INITIALIZED, SCANNING_RANGES, WAITING_TO_LOAD, LOADING),
- RUNNING(WAITING_TO_RUN),
+ ASSIGNED(WAITING_TO_RUN),
+ RUNNING(ASSIGNED),
PERSISTING(RUNNING),
FINISHED(RUNNING, PERSISTING),
CANCELLED(WAITING_TO_SCAN_RANGES, SCANNING_RANGES, WAITING_TO_LOAD, LOADING, WAITING_TO_RUN),
@@ -218,8 +219,6 @@ boolean isComplete()
@Nullable private TaskQueue queued;
private BiConsumer super R, Throwable> callback;
- private List sanityCheck;
- public long createdAt = nanoTime(), waitingToRunAt, runningAt, completedAt;
public AccordTask(@Nonnull AccordCommandStore commandStore, PreLoadContext preLoadContext)
{
@@ -311,23 +310,6 @@ private void state(State state)
{
Invariants.require(rangeScanner == null || rangeScanner.scanned);
Invariants.require(loading == null && waitingToLoad == null, "WAITING_TO_RUN => no loading or waiting; found %s", this, AccordTask::toDescription);
- waitingToRunAt = nanoTime();
- commandStore.executor().elapsedPreparingToRun.increment(waitingToRunAt - createdAt, runningAt);
- }
- else if (state == RUNNING)
- {
- runningAt = nanoTime();
- if (waitingToRunAt == 0)
- {
- waitingToRunAt = runningAt;
- commandStore.executor().elapsedPreparingToRun.increment(waitingToRunAt - createdAt, runningAt);
- }
- commandStore.executor().elapsedWaitingToRun.increment(runningAt - waitingToRunAt, runningAt);
- commandStore.executor().keys.increment(commandsForKey == null ? 0 : commandsForKey.size(), runningAt);
- }
- else if (state.isExecuted() && completedAt == 0)
- {
- completedAt = nanoTime();
}
}
@@ -345,15 +327,34 @@ public AsyncChain chain()
@Override
protected Cancellable start(BiConsumer super R, Throwable> callback)
{
- Invariants.require(AccordTask.this.callback == null);
- AccordTask.this.callback = callback;
- commandStore.tryPreSetup(AccordTask.this);
+ preSetup(callback);
commandStore.executor().submit(AccordTask.this);
return AccordTask.this;
}
};
}
+ public AsyncChain priorityChain()
+ {
+ return new AsyncChains.Head<>()
+ {
+ @Override
+ protected Cancellable start(BiConsumer super R, Throwable> callback)
+ {
+ preSetup(callback);
+ commandStore.executor().submitPriority(AccordTask.this);
+ return AccordTask.this;
+ }
+ };
+ }
+
+ private void preSetup(BiConsumer super R, Throwable> callback)
+ {
+ Invariants.require(this.callback == null);
+ this.callback = callback;
+ commandStore.tryPreSetup(this);
+ }
+
// to be invoked only by the CommandStore owning thread, to take references to objects already in use by the current execution
public void presetup(AccordTask> parent)
{
@@ -628,22 +629,21 @@ private void maybeSanityCheck(AccordSafeCommand safeCommand)
{
if (SANITY_CHECK)
{
- if (sanityCheck == null)
- sanityCheck = new ArrayList<>(commands.size());
- sanityCheck.add(safeCommand.current());
+ if (debug.sanityCheck == null)
+ debug.sanityCheck = new ArrayList<>(commands.size());
+ debug.sanityCheck.add(safeCommand.current());
}
}
private void save(List diffs, Runnable onFlush)
{
- if (sanityCheck != null)
+ if (SANITY_CHECK && debug.sanityCheck != null)
{
- Invariants.require(SANITY_CHECK);
Condition condition = Condition.newOneTimeCondition();
this.commandStore.appendCommands(diffs, condition::signal);
condition.awaitUninterruptibly();
- for (Command check : sanityCheck)
+ for (Command check : debug.sanityCheck)
this.commandStore.sanityCheckCommand(commandStore.unsafeGetRedundantBefore(), check);
if (onFlush != null) onFlush.run();
@@ -655,9 +655,9 @@ private void save(List diffs, Runnable onFlush)
}
@Override
- protected void preRunExclusive()
+ protected void preRunExclusive(Thread assigned)
{
- state(RUNNING);
+ state(ASSIGNED);
queued = null;
if (rangeScanner != null)
{
@@ -673,6 +673,7 @@ protected void preRunExclusive()
@Override
public void runInternal()
{
+ runningAt = nanoTime();
logger.trace("Running {} with state {}", this, state);
AccordSafeCommandStore safeStore = null;
try (Closeable close = locals.get())
@@ -680,8 +681,7 @@ public void runInternal()
if (Tracing.isTracing())
Tracing.trace(preLoadContext.describe());
- if (state != RUNNING)
- throw illegalState("Unexpected state " + toDescription());
+ state(RUNNING);
safeStore = commandStore.begin(this, commandsForRanges);
R result = apply(safeStore);
@@ -717,6 +717,7 @@ public void runInternal()
commandStore.complete(safeStore);
safeStore = null;
+ if (DEBUG_EXECUTION) debug.onRunComplete();
if (!flush)
finish(result, null);
}
@@ -729,10 +730,6 @@ public void runInternal()
}
throw t;
}
- finally
- {
- logger.trace("Exiting {}", this);
- }
}
public void fail(Throwable throwable)
@@ -742,8 +739,8 @@ public void fail(Throwable throwable)
try
{
- commandStore.agent().onException(throwable);
state(FAILED);
+ commandStore.agent().onException(throwable);
}
finally
{
@@ -757,17 +754,16 @@ public void failExclusive(Throwable throwable)
fail(throwable);
}
- protected void cleanupExclusive()
+ @Override
+ protected void cleanupExclusive(AccordExecutor executor)
{
+ super.cleanupExclusive(executor);
Invariants.expect(state.isExecuted());
releaseResources(commandStore.cachesExclusive());
- if (runningAt != 0)
- {
- commandStore.executor().elapsedRunning.increment(completedAt - runningAt, completedAt);
- }
+ executor.keys.increment(commandsForKey == null ? 0 : commandsForKey.size(), runningAt);
if (histogramBuffer != null)
{
- histogramBuffer.flush(completedAt);
+ histogramBuffer.flush(cleanupAt);
histogramBuffer = null;
}
}
@@ -786,16 +782,21 @@ public boolean hasRanges()
@Override
public void cancel()
{
- commandStore.executor().cancel(this);
+ if (!state.isComplete())
+ commandStore.executor().cancel(this);
}
- public void cancelExclusive()
+ void cancelExclusive()
{
+ logger.info("Cancelling {}", preLoadContext);
state(CANCELLED);
if (rangeScanner != null)
rangeScanner.cancelled = true;
if (callback != null)
- commandStore.executor().submit(() -> callback.accept(null, new CancellationException()));
+ {
+ if (commandStore.executor().isInLoop()) callback.accept(null, new CancellationException());
+ else commandStore.executor().submit(() -> callback.accept(null, new CancellationException()));
+ }
}
void cancelExclusive(AccordExecutor owner)
@@ -1064,7 +1065,7 @@ CommandSummaries finish(Caches caches)
public class RangeTxnScanner extends AccordExecutor.AbstractIOTask
{
final Map summaries = new HashMap<>();
- final Map mutexSummaries = Collections.synchronizedMap(summaries);
+ final Map guardedSummaries = Collections.synchronizedMap(summaries);
RangeIndex.Loader loader;
boolean scanned;
@@ -1074,7 +1075,7 @@ public class RangeTxnScanner extends AccordExecutor.AbstractIOTask
protected void runInternal()
{
- loader.load(mutexSummaries, () -> cancelled);
+ loader.load(guardedSummaries, () -> cancelled);
}
PreLoadContext preLoadContext()
@@ -1105,7 +1106,7 @@ public void start(AccordExecutor executor)
void startInternal(Caches caches)
{
loader = commandStore.rangeIndex().loader(preLoadContext.primaryTxnId(), preLoadContext.executeAt(), preLoadContext.loadKeysFor(), preLoadContext.keys());
- loader.loadExclusive(mutexSummaries, caches);
+ loader.loadExclusive(guardedSummaries, caches);
}
public void scannedExclusive()
diff --git a/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java b/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java
index 91adfafd5822..cc270ce7e286 100644
--- a/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java
+++ b/src/java/org/apache/cassandra/service/accord/AccordVerbHandler.java
@@ -23,14 +23,21 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import accord.api.Tracing;
+import accord.local.MapReduceCommandStores;
import accord.local.Node;
+import accord.messages.AbstractRequest;
import accord.messages.Request;
+import accord.utils.Invariants;
import org.apache.cassandra.net.IVerbHandler;
import org.apache.cassandra.net.Message;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing;
import org.apache.cassandra.service.accord.topology.AccordEndpointMapper;
import org.apache.cassandra.utils.NoSpamLogger;
+import static org.apache.cassandra.net.ParamType.ACCORD_TRACING;
+
public class AccordVerbHandler implements IVerbHandler
{
private static final Logger logger = LoggerFactory.getLogger(AccordVerbHandler.class);
@@ -51,6 +58,12 @@ public void doVerb(Message message) throws IOException
logger.trace("Receiving {} from {}", message.payload, message.from());
T request = message.payload;
+ Tracing tracing = (Tracing) message.header.params().get(ACCORD_TRACING);
+ if (tracing != null && request instanceof AbstractRequest, ?>)
+ {
+ Invariants.require(tracing instanceof AccordRemoteTracing);
+ ((MapReduceCommandStores, ?>) request).setTracing(tracing);
+ }
/*
* TODO (desired): messages are retained on heap until the node catches up to waitForEpoch,
diff --git a/src/java/org/apache/cassandra/service/accord/RangeIndex.java b/src/java/org/apache/cassandra/service/accord/RangeIndex.java
index 1d744b2d4df4..a31179a4ebaa 100644
--- a/src/java/org/apache/cassandra/service/accord/RangeIndex.java
+++ b/src/java/org/apache/cassandra/service/accord/RangeIndex.java
@@ -27,6 +27,8 @@
import accord.local.CommandSummaries;
import accord.local.LoadKeysFor;
import accord.local.MaxDecidedRX;
+import accord.local.MinimalCommand;
+import accord.local.MinimalCommand.MinimalWithDeps;
import accord.local.RedundantBefore;
import accord.primitives.Ranges;
import accord.primitives.Routable;
@@ -66,13 +68,13 @@ protected CommandSummaries.Summary loadFromDisk(TxnId txnId)
{
if (loadKeysFor != RECOVERY)
{
- Command.Minimal cmd = commandStore().loadMinimal(txnId);
+ MinimalCommand cmd = commandStore().loadMinimal(txnId);
if (cmd != null)
return ifRelevant(cmd);
}
else
{
- Command.MinimalWithDeps cmd = commandStore().loadMinimalWithDeps(txnId);
+ MinimalWithDeps cmd = commandStore().loadMinimalWithDeps(txnId);
if (cmd != null)
return ifRelevant(cmd);
}
diff --git a/src/java/org/apache/cassandra/service/accord/TokenRange.java b/src/java/org/apache/cassandra/service/accord/TokenRange.java
index d890e5d012ea..5be1b295c520 100644
--- a/src/java/org/apache/cassandra/service/accord/TokenRange.java
+++ b/src/java/org/apache/cassandra/service/accord/TokenRange.java
@@ -22,7 +22,6 @@
import com.google.common.annotations.VisibleForTesting;
-import accord.api.ProtocolModifiers.RangeSpec;
import accord.api.RoutingKey;
import accord.primitives.Range;
import accord.utils.Invariants;
@@ -38,12 +37,14 @@
import org.apache.cassandra.service.accord.api.TokenKey;
import org.apache.cassandra.utils.ObjectSizes;
+import static accord.api.ProtocolModifiers.isRangeEndInclusive;
+
public class TokenRange extends Range
{
public static final long EMPTY_SIZE = ObjectSizes.measure(new TokenRange(TokenKey.min(TableId.fromLong(0), Murmur3Partitioner.instance), TokenKey.max(TableId.fromLong(0), Murmur3Partitioner.instance)));
static
{
- Invariants.require(RangeSpec.isEndInclusive());
+ Invariants.require(isRangeEndInclusive());
}
// Don't make this public use create or createUnsafe
diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java b/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java
index 8d1a7ab90ae7..8d43b320fd9b 100644
--- a/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java
+++ b/src/java/org/apache/cassandra/service/accord/api/AccordAgent.java
@@ -35,6 +35,7 @@
import accord.api.OwnershipEventListener;
import accord.api.ProgressLog.BlockedUntil;
import accord.api.ReplicaEventListener;
+import accord.api.Result;
import accord.api.RoutingKey;
import accord.api.Tracing;
import accord.coordinate.Coordination;
@@ -47,12 +48,11 @@
import accord.local.SafeCommand;
import accord.local.SafeCommandStore;
import accord.local.TimeService;
-import accord.messages.ReplyContext;
+import accord.messages.MessageType;
import accord.primitives.Keys;
import accord.primitives.Participants;
import accord.primitives.Ranges;
import accord.primitives.Routable;
-import accord.primitives.Status;
import accord.primitives.Timestamp;
import accord.primitives.Txn;
import accord.primitives.Txn.Kind;
@@ -68,19 +68,19 @@
import accord.utils.async.AsyncChains;
import accord.utils.async.Cancellable;
-import org.apache.cassandra.config.AccordSpec;
+import org.apache.cassandra.config.AccordConfig;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.exceptions.RequestTimeoutException;
import org.apache.cassandra.metrics.AccordReplicaMetrics;
import org.apache.cassandra.metrics.AccordSystemMetrics;
-import org.apache.cassandra.net.ResponseContext;
import org.apache.cassandra.service.RetryStrategy;
import org.apache.cassandra.service.accord.AccordService;
import org.apache.cassandra.service.accord.debug.AccordTracing;
import org.apache.cassandra.service.accord.serializers.TableMetadatasAndKeys;
import org.apache.cassandra.service.accord.txn.TxnQuery;
import org.apache.cassandra.service.accord.txn.TxnRead;
+import org.apache.cassandra.service.accord.txn.TxnResult;
import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.NoSpamLogger;
@@ -95,6 +95,8 @@
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordScheduleDurabilityTxnIdLag;
import static org.apache.cassandra.config.DatabaseDescriptor.getReadRpcTimeout;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.expireEpochWait;
+import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.expireSyncPoint;
+import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.expireTxn;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.fetch;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.recover;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retryBootstrap;
@@ -104,7 +106,7 @@
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.retrySyncPoint;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowRead;
import static org.apache.cassandra.service.accord.api.AccordWaitStrategies.slowTxnPreaccept;
-import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+import static org.apache.cassandra.service.accord.txn.TxnResult.Kind.txn_data;
// TODO (expected): merge with AccordService
public class AccordAgent implements Agent, OwnershipEventListener
@@ -124,7 +126,7 @@ public static void onFailedBarrier(TxnId txnId, Throwable cause)
private final AccordTracing tracing = new AccordTracing();
private final RandomSource random = new DefaultRandom();
protected Node.Id self;
- protected AccordSpec config;
+ protected AccordConfig config;
public AccordAgent()
{
@@ -248,27 +250,19 @@ public boolean rejectPreAccept(TimeService time, TxnId txnId)
@Override
public long cfkHlcPruneDelta()
{
- return SECONDS.toMicros(10L);
+ return config.commands_for_key_prune_delta.to(MICROSECONDS);
}
@Override
public int cfkPruneInterval()
{
- return 32;
+ return config.commands_for_key_prune_interval;
}
- // TODO (expected): we probably want additional configuration here
@Override
public long maxConflictsHlcPruneDelta()
{
- return SECONDS.toMicros(1);
- }
-
- // TODO (expected): I don't think we even need this - just prune each time we have doubled in size
- @Override
- public long maxConflictsPruneInterval()
- {
- return 1024;
+ return config.max_conflicts_prune_delta.to(MICROSECONDS);
}
@Override
@@ -456,40 +450,27 @@ public long expireEpochWait(TimeUnit units)
}
@Override
- public long expiresAt(ReplyContext replyContext, TimeUnit unit)
+ public long selfSlowAt(TxnId txnId, MessageType type, TimeUnit unit)
{
- return unit.convert(((ResponseContext)replyContext).expiresAtNanos(), NANOSECONDS);
- }
-
- @Override
- public long selfSlowAt(TxnId txnId, Status.Phase phase, TimeUnit unit)
- {
- switch (phase)
+ if (type.getClass() == MessageType.StandardMessage.class)
{
- default: throw new UnhandledEnum(phase);
- case PreAccept: return unit.convert(slowTxnPreaccept.computeWaitUntil(1), NANOSECONDS);
- case Execute: return unit.convert(slowRead.computeWaitUntil(1), NANOSECONDS);
+ switch ((MessageType.StandardMessage)type)
+ {
+ case PRE_ACCEPT_REQ:
+ return unit.convert(slowTxnPreaccept.computeWaitUntil(1), NANOSECONDS);
+ case READ_EPHEMERAL_REQ:
+ case READ_REQ:
+ case STABLE_THEN_READ_REQ:
+ return unit.convert(slowRead.computeWaitUntil(1), NANOSECONDS);
+ }
}
+ return -1;
}
@Override
- public long selfExpiresAt(TxnId txnId, Status.Phase phase, TimeUnit unit)
+ public long selfExpiresAt(TxnId txnId, MessageType type, TimeUnit unit)
{
- long delayNanos;
- switch (txnId.kind())
- {
- default: throw new UnhandledEnum(txnId.kind());
- case Write:
- delayNanos = DatabaseDescriptor.getWriteRpcTimeout(NANOSECONDS);
- break;
- case EphemeralRead:
- case Read:
- delayNanos = DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS);
- break;
- case ExclusiveSyncPoint:
- delayNanos = DatabaseDescriptor.getAccordRangeSyncPointTimeoutNanos();
- }
- return unit.convert(nanoTime() + delayNanos, NANOSECONDS);
+ return unit.convert((txnId.isSyncPoint() ? expireSyncPoint : expireTxn).computeWaitUntil(1), NANOSECONDS);
}
@Override
@@ -516,4 +497,10 @@ public long minStaleHlc(Node node, boolean requested)
{
return node.now() - (100 + getAccordScheduleDurabilityTxnIdLag(MICROSECONDS));
}
+
+ @Override
+ public boolean reportRemoteSuccess(Result success)
+ {
+ return success instanceof TxnResult && ((TxnResult) success).kind() == txn_data;
+ }
}
diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java b/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java
index c0c0482f10b6..701620c56749 100644
--- a/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java
+++ b/src/java/org/apache/cassandra/service/accord/api/AccordScheduler.java
@@ -19,6 +19,7 @@
package org.apache.cassandra.service.accord.api;
import java.util.List;
+import java.util.concurrent.Executor;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
@@ -29,7 +30,7 @@
import org.apache.cassandra.concurrent.ScheduledExecutorPlus;
import org.apache.cassandra.concurrent.Shutdownable;
-public class AccordScheduler implements Scheduler, Shutdownable
+public class AccordScheduler implements Scheduler, Shutdownable, Executor
{
private final ScheduledExecutorPlus scheduledExecutor = ExecutorFactory.Global.executorFactory().scheduled(false, "AccordScheduled");
@@ -112,4 +113,10 @@ public boolean awaitTermination(long timeout, TimeUnit units) throws Interrupted
{
return scheduledExecutor.awaitTermination(timeout, units);
}
+
+ @Override
+ public void execute(Runnable command)
+ {
+ scheduledExecutor.execute(command);
+ }
}
diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java b/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java
index b800e30b110a..c7842c749337 100644
--- a/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java
+++ b/src/java/org/apache/cassandra/service/accord/api/AccordTimeService.java
@@ -22,13 +22,22 @@
import accord.local.TimeService;
-import org.apache.cassandra.utils.Clock;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.FBUtilities;
import static java.util.concurrent.TimeUnit.NANOSECONDS;
+import static org.apache.cassandra.utils.Clock.Global.currentTimeMicros;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
public class AccordTimeService implements TimeService
{
+ private static final boolean PRECISE_MICROS;
+ static
+ {
+ Boolean precise = DatabaseDescriptor.getAccord().precise_micros;
+ PRECISE_MICROS = precise == null || precise;
+ }
+
@Override
public long now()
{
@@ -37,7 +46,7 @@ public long now()
public static long nowMicros()
{
- return TimeUnit.MILLISECONDS.toMicros(Clock.Global.currentTimeMillis());
+ return PRECISE_MICROS ? currentTimeMicros() : FBUtilities.timestampMicros();
}
@Override
diff --git a/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java b/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java
index b48056f4257b..cf7d8ff5ae2a 100644
--- a/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java
+++ b/src/java/org/apache/cassandra/service/accord/api/AccordWaitStrategies.java
@@ -22,7 +22,7 @@
import accord.primitives.TxnId;
-import org.apache.cassandra.config.AccordSpec;
+import org.apache.cassandra.config.AccordConfig;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.StringRetryStrategy;
import org.apache.cassandra.net.Verb;
@@ -84,7 +84,7 @@ public static RetryStrategy retryFetchTopology()
static
{
- AccordSpec config = DatabaseDescriptor.getAccord();
+ AccordConfig config = DatabaseDescriptor.getAccord();
setSlowRead(config.slow_read);
setSlowTxnPreaccept(config.slow_txn_preaccept);
setSlowSyncPointPreaccept(config.slow_syncpoint_preaccept);
diff --git a/src/java/org/apache/cassandra/service/accord/debug/AccordRemoteTracing.java b/src/java/org/apache/cassandra/service/accord/debug/AccordRemoteTracing.java
new file mode 100644
index 000000000000..80617d483c99
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/accord/debug/AccordRemoteTracing.java
@@ -0,0 +1,385 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.accord.debug;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import accord.api.Tracing;
+import accord.local.CommandStore;
+import accord.primitives.TxnId;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.net.IVerbHandler;
+import org.apache.cassandra.net.Message;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.net.ParamType;
+import org.apache.cassandra.net.Verb;
+import org.apache.cassandra.service.accord.AccordService;
+import org.apache.cassandra.service.accord.api.AccordAgent;
+import org.apache.cassandra.service.accord.api.AccordTimeService;
+import org.apache.cassandra.service.accord.debug.AccordTracing.TxnEvent;
+import org.apache.cassandra.service.accord.serializers.CommandSerializers;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.membership.NodeId;
+
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+import static org.apache.cassandra.utils.MonotonicClock.Global.preciseTime;
+
+public class AccordRemoteTracing implements Tracing
+{
+ public static class AccordRemoteTrace {}
+ public static final class AccordTraceOut extends AccordRemoteTrace
+ {
+ final TxnId txnId;
+ final long idMicros;
+ final long receivedAtMicros;
+ final long nanosSinceReceived;
+ final int commandStoreId;
+ final String message;
+
+ AccordTraceOut(TxnId txnId, long idMicros, long receivedAtMicros, long nanosSinceReceived, int commandStoreId, String message)
+ {
+ this.txnId = txnId;
+ this.idMicros = idMicros;
+ this.receivedAtMicros = receivedAtMicros;
+ this.nanosSinceReceived = nanosSinceReceived;
+ this.commandStoreId = commandStoreId;
+ this.message = message;
+ }
+ }
+
+ public static final class AccordTraceIn extends AccordRemoteTrace
+ {
+ final TxnId txnId;
+ final long idMicros;
+ final long atNanos;
+ final int commandStoreId;
+ final String message;
+
+ AccordTraceIn(TxnId txnId, long idMicros, long atNanos, int commandStoreId, String message)
+ {
+ this.txnId = txnId;
+ this.idMicros = idMicros;
+ this.atNanos = atNanos;
+ this.commandStoreId = commandStoreId;
+ this.message = message;
+ }
+ }
+
+ public static final class AccordTracingIn implements Tracing
+ {
+ final TxnId txnId;
+ final long idMicros;
+ final long onWireAtNanos;
+ final List<AccordTraceIn> messages;
+
+ public AccordTracingIn(TxnId txnId, long idMicros, long onWireAtNanos, List<AccordTraceIn> messages)
+ {
+ this.txnId = txnId;
+ this.idMicros = idMicros;
+ this.onWireAtNanos = onWireAtNanos;
+ this.messages = messages;
+ }
+
+ void report(NodeId from)
+ {
+ AccordTracing tracing = ((AccordAgent)AccordService.unsafeInstance().agent()).tracing();
+ long offWireAtNanos = nanoTime();
+ long onWireAtNanos = Math.min(offWireAtNanos - 1, this.onWireAtNanos);
+ int fromId = from == null ? Integer.MAX_VALUE : from.id();
+ for (AccordTraceIn message : messages)
+ tracing.report(message, fromId);
+ tracing.report(new AccordTraceIn(txnId, idMicros, onWireAtNanos, -1, "Reply on wire"), fromId);
+ tracing.report(new AccordTraceIn(txnId, idMicros, offWireAtNanos, -1, "Reply off wire from " + from) , -1);
+ }
+
+ @Override
+ public void trace(CommandStore commandStore, String message)
+ {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ public static final class AccordTracingOut implements Tracing
+ {
+ final TxnId txnId;
+ final long idMicros;
+ final long receivedAtMicros;
+ final long receivedAtNanos;
+ final List<BufferedMessage> messages;
+
+ AccordTracingOut(TxnId txnId, long idMicros, long receivedAtMicros, long receivedAtNanos, List<BufferedMessage> messages)
+ {
+ this.txnId = txnId;
+ this.idMicros = idMicros;
+ this.receivedAtMicros = receivedAtMicros;
+ this.receivedAtNanos = receivedAtNanos;
+ this.messages = messages;
+ }
+
+ @Override
+ public void trace(CommandStore commandStore, String message)
+ {
+ if (message.length() > 100)
+ message = message.substring(0, 100);
+ messages.add(new BufferedMessage(nanoTime(), commandStore == null ? -1 : commandStore.id(), message));
+ }
+ }
+
+ static class BufferedMessage
+ {
+ final long atNanos;
+ final int commandStoreId;
+ final String message;
+
+ BufferedMessage(long atNanos, int commandStoreId, String message)
+ {
+ this.atNanos = atNanos;
+ this.commandStoreId = commandStoreId;
+ this.message = message;
+ }
+ }
+
+ final TxnId txnId;
+ final long idMicros;
+ private InetAddressAndPort replyTo;
+ final long receivedAtMicros = AccordTimeService.nowMicros();
+ final long receivedAtNanos = nanoTime();
+ List<BufferedMessage> messages = new ArrayList<>();
+
+ private AccordRemoteTracing(TxnId txnId, long idMicros)
+ {
+ this.txnId = txnId;
+ this.idMicros = idMicros;
+ }
+
+ public void setReplyTo(InetAddressAndPort from)
+ {
+ replyTo = from;
+ }
+
+ @Override
+ public synchronized void trace(CommandStore commandStore, String message)
+ {
+ long atNanos = nanoTime();
+ int commandStoreId = commandStore == null ? -1 : commandStore.id();
+ // TODO (expected): make this configurable
+ if (message.length() > 100)
+ message = message.substring(0, 100);
+
+ if (messages == null)
+ {
+ long nanosSinceReceived = atNanos - receivedAtNanos;
+ Message<AccordTraceOut> reply = Message.out(Verb.ACCORD_REMOTE_TRACE, new AccordTraceOut(txnId, idMicros, receivedAtMicros, nanosSinceReceived, commandStoreId, message));
+ MessagingService.instance().send(reply, replyTo);
+ }
+ else
+ {
+ messages.add(new BufferedMessage(atNanos, commandStoreId, message));
+ }
+ }
+
+ @Override
+ public synchronized Tracing send()
+ {
+ List<BufferedMessage> messages = this.messages;
+ if (messages == null) messages = new ArrayList<>();
+ else this.messages = null;
+ return new AccordTracingOut(txnId, idMicros, receivedAtMicros, receivedAtNanos, messages);
+ }
+
+ public static final IVerbHandler<AccordRemoteTrace> traceMessageHandler = message -> {
+ AccordTracing tracing = ((AccordAgent)AccordService.unsafeInstance().agent()).tracing();
+ AccordTraceIn trace = (AccordTraceIn) message.payload;
+ NodeId nodeId = ClusterMetadata.current().directory.peerId(message.from());
+ tracing.report(trace, nodeId == null ? Integer.MAX_VALUE : nodeId.id());
+ };
+
+ static final int REPLY_FLAG = 1;
+ public static final IVersionedSerializer<Tracing> tracingSerializer = new IVersionedSerializer<>()
+ {
+ @Override
+ public void serialize(Tracing t, DataOutputPlus out, int version) throws IOException
+ {
+ int flags = 0;
+ if (t.getClass() == TxnEvent.class)
+ {
+ TxnEvent event = (TxnEvent) t;
+ out.writeUnsignedVInt32(flags);
+ CommandSerializers.txnId.serialize(event.txnId(), out);
+ out.writeLong(event.idMicros);
+ }
+ else
+ {
+ AccordTracingOut tracing = (AccordTracingOut) t;
+ flags |= REPLY_FLAG;
+ out.writeUnsignedVInt32(flags);
+ CommandSerializers.txnId.serialize(tracing.txnId, out);
+ out.writeLong(tracing.idMicros);
+ out.writeLong(tracing.receivedAtMicros);
+ out.writeLong(nanoTime() - tracing.receivedAtNanos);
+ out.writeUnsignedVInt32(tracing.messages.size());
+ for (BufferedMessage message : tracing.messages)
+ {
+ out.writeUnsignedVInt(message.atNanos - tracing.receivedAtNanos);
+ out.writeUnsignedVInt32(1 + message.commandStoreId);
+ out.writeUTF(message.message);
+ }
+ }
+ }
+
+ @Override
+ public Tracing deserialize(DataInputPlus in, int version) throws IOException
+ {
+ int flags = in.readUnsignedVInt32();
+ TxnId txnId = CommandSerializers.txnId.deserialize(in);
+ long idMicros = in.readLong();
+ if ((flags & REPLY_FLAG) == 0)
+ return new AccordRemoteTracing(txnId, idMicros);
+
+ long remoteReceivedAtMicros = in.readLong();
+ long remoteReceivedAtNanos = preciseTime.translate().fromMicrosSinceEpoch(remoteReceivedAtMicros);
+ long onWireAtNanos = remoteReceivedAtNanos + in.readLong();
+ int messageCount = in.readUnsignedVInt32();
+ List<AccordTraceIn> messages = new ArrayList<>(messageCount);
+ for (int i = 0 ; i < messageCount ; ++i)
+ {
+ long atNanos = remoteReceivedAtNanos + in.readUnsignedVInt();
+ int commandStoreId = in.readUnsignedVInt32() - 1;
+ String message = in.readUTF();
+ messages.add(new AccordTraceIn(txnId, idMicros, atNanos, commandStoreId, message));
+ }
+ return new AccordTracingIn(txnId, idMicros, onWireAtNanos, messages);
+ }
+
+ @Override
+ public long serializedSize(Tracing t, int version)
+ {
+ int flags = 0;
+ if (t.getClass() == TxnEvent.class)
+ {
+ TxnEvent event = (TxnEvent) t;
+ long size = TypeSizes.sizeofUnsignedVInt(flags);
+ size += CommandSerializers.txnId.serializedSize(event.txnId());
+ size += TypeSizes.LONG_SIZE;
+ return size;
+ }
+ else
+ {
+ AccordTracingOut tracing = (AccordTracingOut) t;
+ flags |= REPLY_FLAG;
+ long size = TypeSizes.sizeofUnsignedVInt(flags);
+ size += CommandSerializers.txnId.serializedSize(tracing.txnId);
+ size += 3 * TypeSizes.LONG_SIZE;
+ size += TypeSizes.sizeofUnsignedVInt(tracing.messages.size());
+ for (BufferedMessage message : tracing.messages)
+ {
+ size += TypeSizes.sizeofUnsignedVInt(message.atNanos - tracing.receivedAtNanos);
+ size += TypeSizes.sizeofUnsignedVInt(1 + message.commandStoreId);
+ size += TypeSizes.sizeof(message.message);
+ }
+ return size;
+ }
+ }
+ };
+
+ public static final IVersionedSerializer<AccordRemoteTrace> traceSerializer = new IVersionedSerializer<>()
+ {
+ @Override
+ public void serialize(AccordRemoteTrace t, DataOutputPlus out, int version) throws IOException
+ {
+ out.writeUnsignedVInt32(0);
+ AccordTraceOut trace = (AccordTraceOut) t;
+ CommandSerializers.txnId.serialize(trace.txnId, out);
+ out.writeUnsignedVInt(trace.idMicros);
+ out.writeUnsignedVInt(trace.receivedAtMicros);
+ out.writeUnsignedVInt(trace.nanosSinceReceived);
+ out.writeUnsignedVInt32(1 + trace.commandStoreId);
+ out.writeUTF(trace.message);
+ }
+
+ @Override
+ public AccordRemoteTrace deserialize(DataInputPlus in, int version) throws IOException
+ {
+ in.readUnsignedVInt32();
+ TxnId txnId = CommandSerializers.txnId.deserialize(in);
+ long idMicros = in.readUnsignedVInt();
+ long receivedAtMicros = in.readUnsignedVInt();
+ long nanosSinceReceived = in.readUnsignedVInt();
+ long atNanos = nanosSinceReceived + preciseTime.translate().fromMicrosSinceEpoch(receivedAtMicros);
+ int commandStoreId = in.readUnsignedVInt32() - 1;
+ String message = in.readUTF();
+ return new AccordTraceIn(txnId, idMicros, atNanos, commandStoreId, message);
+ }
+
+ @Override
+ public long serializedSize(AccordRemoteTrace t, int version)
+ {
+ AccordTraceOut trace = (AccordTraceOut) t;
+ long size = TypeSizes.sizeofUnsignedVInt(0);
+ size += CommandSerializers.txnId.serializedSize(trace.txnId);
+ size += TypeSizes.sizeofUnsignedVInt(trace.idMicros);
+ size += TypeSizes.sizeofUnsignedVInt(trace.receivedAtMicros);
+ size += TypeSizes.sizeofUnsignedVInt(trace.nanosSinceReceived);
+ size += TypeSizes.sizeofUnsignedVInt(1 + trace.commandStoreId);
+ size += TypeSizes.sizeof(trace.message);
+ return size;
+ }
+ };
+
+ public static void traceOnWire(Message.Header out, InetAddressAndPort to)
+ {
+ Object obj = out.params().get(ParamType.ACCORD_TRACING);
+ if (obj == null)
+ return;
+
+ Tracing tracing = (Tracing) obj;
+ if (tracing instanceof TxnEvent)
+ {
+ NodeId id = ClusterMetadata.current().directory.peerId(to);
+ tracing.trace(null, "Request on wire to %s", id);
+ }
+ }
+
+ public static void traceOffWire(Message.Header in)
+ {
+ Object obj = in.params().get(ParamType.ACCORD_TRACING);
+ if (obj == null)
+ return;
+
+ if (obj instanceof AccordRemoteTracing)
+ {
+ AccordRemoteTracing tracing = (AccordRemoteTracing) obj;
+ tracing.setReplyTo(in.from);
+ tracing.trace(null, "Request off wire");
+ }
+ else
+ {
+ AccordTracingIn tracing = (AccordTracingIn) obj;
+ NodeId id = ClusterMetadata.current().directory.peerId(in.from);
+ tracing.report(id);
+ }
+ }
+}
diff --git a/src/java/org/apache/cassandra/service/accord/debug/AccordTracing.java b/src/java/org/apache/cassandra/service/accord/debug/AccordTracing.java
index 2e614e8f6658..570765392f94 100644
--- a/src/java/org/apache/cassandra/service/accord/debug/AccordTracing.java
+++ b/src/java/org/apache/cassandra/service/accord/debug/AccordTracing.java
@@ -22,8 +22,11 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.TreeSet;
+import java.util.WeakHashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ThreadLocalRandom;
@@ -46,6 +49,7 @@
import accord.local.CommandStore;
import accord.local.Node;
import accord.primitives.Participants;
+import accord.primitives.Routables;
import accord.primitives.TxnId;
import accord.utils.Invariants;
import accord.utils.SortedListMap;
@@ -54,11 +58,14 @@
import org.apache.cassandra.metrics.AccordCoordinatorMetrics;
import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.accord.debug.AccordRemoteTracing.AccordTraceIn;
import org.apache.cassandra.utils.Clock;
import org.apache.cassandra.utils.NoSpamLogger;
import static org.apache.cassandra.service.accord.debug.AccordTracing.BucketMode.LEAKY;
import static org.apache.cassandra.service.accord.debug.AccordTracing.BucketMode.SAMPLE;
+import static org.apache.cassandra.service.accord.debug.AccordTracing.BucketMode.SLOWEST;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
public class AccordTracing extends AccordCoordinatorMetrics.Listener
{
@@ -68,16 +75,20 @@ public class AccordTracing extends AccordCoordinatorMetrics.Listener
public enum BucketMode
{
- LEAKY, SAMPLE, RING;
+ LEAKY, SAMPLE, RING, SLOWEST;
int position(int permits, int total)
{
switch (this)
{
default: throw UnhandledEnum.unknown(this);
- case LEAKY: return Integer.MAX_VALUE;
- case RING: return total % permits;
- case SAMPLE: return ThreadLocalRandom.current().nextInt(total);
+ case SLOWEST:
+ case LEAKY:
+ return Integer.MAX_VALUE;
+ case RING:
+ return total % permits;
+ case SAMPLE:
+ return ThreadLocalRandom.current().nextInt(total);
}
}
}
@@ -90,14 +101,26 @@ public interface ConsumeState
public static class Message
{
public final long atNanos;
+ public final int nodeId;
public final int commandStoreId;
public final String message;
- Message(int commandStoreId, String message, long atLeastNanos)
+ static Message withAtLeastNanos(int nodeId, int commandStoreId, String message, long atLeastNanos)
{
+ return new Message(nodeId, commandStoreId, message, Math.max(atLeastNanos, nanoTime()));
+ }
+
+ static Message withExactNanos(int nodeId, int commandStoreId, String message, long atNanos)
+ {
+ return new Message(nodeId, commandStoreId, message, atNanos);
+ }
+
+ private Message(int nodeId, int commandStoreId, String message, long atNanos)
+ {
+ this.atNanos = atNanos;
+ this.nodeId = nodeId;
this.commandStoreId = commandStoreId;
this.message = message;
- this.atNanos = Math.max(atLeastNanos, Clock.Global.nanoTime());
}
@Override
@@ -109,25 +132,29 @@ public String toString()
public static class TxnEvent implements Tracing, Comparable
{
+ public final TxnEvents parent;
public final CoordinationKind kind;
public final long idMicros = uniqueNowMicros();
- public final long atNanos = Clock.Global.nanoTime();
+ public final long atNanos = nanoTime();
final List messages = new ArrayList<>();
int index = -1, subIndex = -1;
+ long elapsedNanos;
- public TxnEvent(CoordinationKind kind)
+ public TxnEvent(TxnEvents parent, CoordinationKind kind)
{
+ this.parent = parent;
this.kind = kind;
}
@Override
- public void trace(CommandStore commandStore, String s)
+ public synchronized void trace(CommandStore commandStore, String s)
{
long prevNanos = messages.isEmpty() ? 0 : messages.get(messages.size() - 1).atNanos;
int id = commandStore == null ? -1 : commandStore.id();
+ // TODO (expected): make this configurable
if (s.length() > 1000)
s = s.substring(0, 1000);
- messages.add(new Message(id, s, prevNanos + 1));
+ messages.add(Message.withAtLeastNanos(-1, id, s, prevNanos + 1));
}
@Override
@@ -140,6 +167,79 @@ public List messages()
{
return Collections.unmodifiableList(messages);
}
+
+ @Override
+ public Tracing send()
+ {
+ return this;
+ }
+
+ public long elapsedNanos()
+ {
+ return elapsedNanos;
+ }
+
+ public long doneAtNanos()
+ {
+ return atNanos + elapsedNanos;
+ }
+
+ public long doneAtMicros()
+ {
+ return idMicros + elapsedNanos/1000;
+ }
+
+ @Override
+ public void done()
+ {
+ elapsedNanos = nanoTime() - atNanos;
+ TracePatternState pattern = parent.owner;
+ if (pattern != null && pattern.bucketMode == SLOWEST && !pattern.updateSlowest(parent, elapsedNanos))
+ return;
+
+ if (index < 0)
+ {
+ parent.parent.txnIdMap.compute(parent.txnId, (id, events) -> {
+ TracePatternState owner = parent.owner;
+ if (owner != null)
+ {
+ synchronized (owner)
+ {
+ owner.detachedEvent.remove(idMicros);
+ }
+ }
+
+ if (events != parent)
+ return events;
+
+ TxnEventsList subList = events.subLists.get(kind);
+ TxnEventsList list = subList == null ? events : subList;
+ if (list.size < events.bucketSize && (subList == null || subList.size < events.bucketSubSize))
+ {
+ if (events.parent.globalCount.admit())
+ events.add(kind, subList, this);
+ return events;
+ }
+
+ for (int i = 0 ; i < list.size ; ++i)
+ {
+ TxnEvent event = list.get(i);
+ if (event.elapsedNanos < elapsedNanos)
+ {
+ events.remove(event.index);
+ events.add(kind, subList, this);
+ break;
+ }
+ }
+ return events;
+ });
+ }
+ }
+
+ public TxnId txnId()
+ {
+ return parent.txnId;
+ }
}
private static class TxnEventsList
@@ -189,11 +289,28 @@ void incrementSeen()
public static class TxnEvents extends TxnEventsList
{
private final EnumMap subLists = new EnumMap<>(CoordinationKind.class);
+
+ final AccordTracing parent;
+ final TxnId txnId;
+
private CoordinationKinds traceEvents = CoordinationKinds.ALL;
private BucketMode mode = LEAKY;
private TracePatternState owner;
private int bucketSize, bucketSubSize;
private float chance = 1.0f;
+ private boolean detached;
+
+ public TxnEvents(AccordTracing parent, TxnId txnId)
+ {
+ this.parent = parent;
+ this.txnId = txnId;
+ }
+
+ private TxnEvents setDetached()
+ {
+ detached = true;
+ return this;
+ }
void remove(int index)
{
@@ -287,7 +404,11 @@ TxnEvent trace(CoordinationKind kind, GlobalCount globalCount)
subList.incrementSeen();
int position = mode.position(bucketSubSize, subList.bucketSeen);
if (position >= bucketSubSize)
+ {
+ if (mode == SLOWEST)
+ return new TxnEvent(this, kind);
return null;
+ }
remove(subList.get(position).index);
}
@@ -296,13 +417,17 @@ else if (bucketSize <= size)
incrementSeen();
int position = mode.position(bucketSize, bucketSeen);
if (position >= bucketSize)
+ {
+ if (mode == SLOWEST)
+ return new TxnEvent(this, kind);
return null;
+ }
remove(position);
}
else
{
- if (!globalCount.admit())
+ if (globalCount != null && !globalCount.admit())
return null;
}
@@ -320,16 +445,21 @@ TxnEvent forceTrace(CoordinationKind kind, GlobalCount globalCount)
}
private TxnEvent newTrace(CoordinationKind kind, TxnEventsList subList)
+ {
+ TxnEvent event = new TxnEvent(this, kind);
+ add(kind, subList, event);
+ return event;
+ }
+
+ private void add(CoordinationKind kind, TxnEventsList subList, TxnEvent event)
{
if (subList == null)
subLists.put(kind, subList = new TxnEventsList());
- TxnEvent event = new TxnEvent(kind);
event.subIndex = subList.size;
subList.addInternal(event);
event.index = size;
addInternal(event);
- return event;
}
public boolean hasOwner()
@@ -370,7 +500,12 @@ public BucketMode bucketMode()
public void forEach(Consumer forEach)
{
for (int i = 0 ; i < size ; ++i)
- forEach.accept(events[i]);
+ {
+ synchronized (events[i])
+ {
+ forEach.accept(events[i]);
+ }
+ }
}
}
@@ -423,7 +558,7 @@ public TracePattern withChance(float chance)
return new TracePattern(kinds, intersects, traceNew, traceFailures, chance);
}
- boolean matches(TxnId txnId, @Nullable Participants> participants, CoordinationKind kind, NewOrFailure newOrFailure)
+ boolean matches(TxnId txnId, @Nullable Routables> participants, CoordinationKind kind, NewOrFailure newOrFailure)
{
if (kinds != null && !kinds.matches(txnId))
return false;
@@ -432,13 +567,33 @@ boolean matches(TxnId txnId, @Nullable Participants> participants, Coordinatio
if (testKind == null || !testKind.contains(kind))
return false;
- if (intersects != null && (participants == null || !intersects.intersects(participants)))
+ if (intersects != null && (participants == null || !participants.intersects(intersects)))
return false;
return chance >= 1.0f || ThreadLocalRandom.current().nextFloat() <= chance;
}
}
+ private static class SlowestTxnId implements Comparable
+ {
+ final TxnId txnId;
+ long elapsedNanos;
+
+ private SlowestTxnId(TxnId txnId, long elapsedNanos)
+ {
+ this.txnId = txnId;
+ this.elapsedNanos = elapsedNanos;
+ }
+
+ @Override
+ public int compareTo(SlowestTxnId that)
+ {
+ int c = Long.compare(this.elapsedNanos, that.elapsedNanos);
+ if (c == 0) c = this.txnId.compareTo(that.txnId);
+ return c;
+ }
+ }
+
public class TracePatternState
{
final int id;
@@ -452,6 +607,10 @@ public class TracePatternState
private CoordinationKinds traceEvents = new CoordinationKinds(false, 0);
private final List txnIds = new ArrayList<>();
+ private final WeakHashMap detachedEvents = new WeakHashMap<>();
+ private final WeakHashMap detachedEvent = new WeakHashMap<>();
+ private final Map slowestLookup = new HashMap<>();
+ private final TreeSet slowest = new TreeSet<>();
public TracePatternState(int id)
{
@@ -479,7 +638,7 @@ public TxnId get(int index)
return txnIds.get(index);
}
- TxnEvents maybeAdd(TxnId txnId, @Nullable Participants> participants, CoordinationKind kind, NewOrFailure newOrFailure)
+ TxnEvents maybeAdd(TxnId txnId, @Nullable Routables> participants, CoordinationKind kind, NewOrFailure newOrFailure)
{
if (!pattern.matches(txnId, participants, kind, newOrFailure))
return null;
@@ -487,37 +646,160 @@ TxnEvents maybeAdd(TxnId txnId, @Nullable Participants> participants, Coordina
return maybeAdd(txnId);
}
- private synchronized TxnEvents maybeAdd(TxnId txnId)
+ private TxnEvents maybeAdd(TxnId txnId)
{
- if (bucketSize == 0)
- return null;
+ class MaybeAdd implements BiFunction
+ {
+ TxnEvents result;
+ TxnId untrace;
- if (++bucketSeen < 0)
- bucketSeen = Integer.MAX_VALUE;
+ @Override
+ public TxnEvents apply(TxnId txnId, TxnEvents in)
+ {
+ synchronized (TracePatternState.this)
+ {
+ if (in != null)
+ return null; // already tracing
+
+ if (bucketSize == 0)
+ return null;
+
+ if (++bucketSeen < 0)
+ bucketSeen = Integer.MAX_VALUE;
+
+ if (bucketSize > txnIds.size())
+ {
+ result = newEvents(txnId);
+ txnIds.add(txnId);
+ if (bucketMode == SLOWEST)
+ initSlowest(result);
+ return result;
+ }
+
+ int position = bucketMode.position(bucketSize, bucketSeen);
+ if (position >= bucketSize)
+ {
+ if (bucketMode == SLOWEST)
+ result = detachedEvents.computeIfAbsent(txnId, id -> newEvents(id).setDetached());
+ return null;
+ }
+
+ result = newEvents(txnId);
+ untrace = txnIds.get(position);
+ txnIds.set(position, txnId);
+ if (bucketMode == SLOWEST)
+ initSlowest(result);
+ return result;
+ }
+ }
+ }
+
+ MaybeAdd maybeAdd = new MaybeAdd();
+ txnIdMap.compute(txnId, maybeAdd);
+ if (maybeAdd.untrace != null)
+ untrace(maybeAdd.untrace);
+ return maybeAdd.result;
+ }
- if (bucketSize > txnIds.size())
+ private boolean initSlowest(TxnEvents events)
+ {
+ SlowestTxnId entry = new SlowestTxnId(events.txnId, Long.MAX_VALUE);
+ if (null != slowestLookup.putIfAbsent(events.txnId, entry))
+ return false;
+ slowest.add(entry);
+ return true;
+ }
+
+ private boolean updateSlowest(TxnEvents events, long elapsedNanos)
+ {
+ synchronized (this)
{
- TxnEvents added = trace(txnId);
- if (added != null)
- txnIds.add(txnId);
- return added;
+ SlowestTxnId entry = slowestLookup.get(events.txnId);
+ if (entry != null)
+ {
+ if (entry.elapsedNanos < elapsedNanos)
+ {
+ slowest.remove(entry);
+ entry.elapsedNanos = elapsedNanos;
+ slowest.add(entry);
+ }
+ return true;
+ }
}
- int position = bucketMode.position(bucketSize, bucketSeen);
+ class UpdateSlowest implements BiFunction
+ {
+ boolean result;
+ TxnId untrace;
- if (position >= bucketSize)
- return null;
+ @Override
+ public TxnEvents apply(TxnId txnId, TxnEvents cur)
+ {
+ if (cur != null && cur != events)
+ return cur;
- TxnEvents added = trace(txnId);
- if (added == null)
- return null;
+ synchronized (TracePatternState.this)
+ {
+ SlowestTxnId entry = slowestLookup.get(events.txnId);
+ if (entry != null)
+ {
+ if (entry.elapsedNanos < elapsedNanos)
+ {
+ slowest.remove(entry);
+ entry.elapsedNanos = elapsedNanos;
+ slowest.add(entry);
+ }
+ result = true;
+ return events;
+ }
+
+ int index = txnIds.size();
+ if (index >= bucketSize && !slowest.isEmpty())
+ {
+ SlowestTxnId leastSlow = slowest.first();
+ if (leastSlow.elapsedNanos >= elapsedNanos)
+ return cur;
+
+ SlowestTxnId removed;
+ removed = slowest.pollFirst();
+ Invariants.expect(leastSlow.equals(removed), "%s != %s", leastSlow, removed);
+ removed = slowestLookup.remove(leastSlow.txnId);
+ Invariants.expect(leastSlow.equals(removed), "%s != %s", leastSlow, removed);
+ index = txnIds.indexOf(leastSlow.txnId);
+ Invariants.expect(index >= 0);
+ if (index < 0)
+ return cur;
+
+ untrace = leastSlow.txnId;
+ txnIds.set(index, events.txnId);
+ }
+ else
+ {
+ if (index > bucketSize)
+ txnIds.remove(0);
+ txnIds.add(events.txnId);
+ }
+
+ if (null != slowestLookup.put(events.txnId, entry = new SlowestTxnId(events.txnId, elapsedNanos)))
+ return cur;
+
+ slowest.add(entry);
+ detachedEvents.remove(entry.txnId);
+ events.detached = false;
+ result = true;
+ return events;
+ }
+ }
+ }
- untrace(txnIds.get(position));
- txnIds.set(position, txnId);
- return added;
+ UpdateSlowest updateSlowest = new UpdateSlowest();
+ txnIdMap.compute(events.txnId, updateSlowest);
+ if (updateSlowest.untrace != null)
+ untrace(updateSlowest.untrace);
+ return updateSlowest.result;
}
- private synchronized void untrace(TxnId txnId)
+ private void untrace(TxnId txnId)
{
txnIdMap.compute(txnId, (ignore, cur) -> {
if (cur == null || cur.owner != this)
@@ -528,16 +810,14 @@ private synchronized void untrace(TxnId txnId)
});
}
- private synchronized TxnEvents trace(TxnId txnId)
+ private TxnEvents newEvents(TxnId txnId)
{
- TxnEvents events = new TxnEvents();
+ TxnEvents events = new TxnEvents(AccordTracing.this, txnId);
events.mode = traceBucketMode;
events.bucketSize = traceBucketSize;
events.bucketSubSize = traceBucketSubSize;
events.owner = this;
- if (null == txnIdMap.putIfAbsent(txnId, events))
- return events;
- return null;
+ return events;
}
synchronized void set(Function pattern, BucketMode newBucketMode, int newBucketSeen, int newBucketSize, BucketMode newTraceBucketMode, int newTraceBucketSize, int newTraceBucketSubSize, CoordinationKinds newTraceEvents)
@@ -545,8 +825,16 @@ synchronized void set(Function pattern, BucketMode n
Invariants.require(newBucketSize != 0);
Invariants.require(newTraceBucketSize != 0);
this.pattern = pattern.apply(this.pattern);
- if (newBucketMode != null)
+ if (newBucketMode != null && newBucketMode != bucketMode)
+ {
+ if (bucketMode == SLOWEST)
+ {
+ slowest.clear();
+ slowestLookup.clear();
+ detachedEvents.clear();
+ }
this.bucketMode = newBucketMode;
+ }
if (newBucketSize >= 0)
this.bucketSize = newBucketSize;
if (newBucketSeen >= 0)
@@ -561,11 +849,19 @@ synchronized void set(Function pattern, BucketMode n
this.traceEvents = newTraceEvents;
}
- synchronized void clear()
+ void clear()
{
- for (TxnId txnId : txnIds)
+ List untrace;
+ synchronized (this)
+ {
+ untrace = new ArrayList<>(txnIds);
+ txnIds.clear();
+ slowest.clear();
+ slowestLookup.clear();
+ detachedEvents.clear();
+ }
+ for (TxnId txnId : untrace)
untrace(txnId);
- txnIds.clear();
}
}
@@ -602,20 +898,54 @@ private static long uniqueNowMicros()
final CopyOnWriteArrayList traceNewPatterns = new CopyOnWriteArrayList<>();
final GlobalCount globalCount = new GlobalCount();
- public Tracing trace(TxnId txnId, @Nullable Participants> participants, CoordinationKind kind)
+ public Tracing trace(TxnId txnId, @Nullable Routables> participants, CoordinationKind kind)
{
if (kind == CoordinationKind.FetchDurableBefore)
return (cs, msg) -> logger.info("Catchup/FetchDurableBefore: {}", msg.length() <= 100 ? msg : msg.substring(0, 100));
- if (!txnIdMap.containsKey(txnId) && null == maybeTrace(txnId, participants, kind, NewOrFailure.NEW, traceNewPatterns))
- return null;
+ if (!txnIdMap.containsKey(txnId))
+ {
+ TxnEvents events = maybeTrace(txnId, participants, kind, NewOrFailure.NEW, traceNewPatterns);
+ if (events == null)
+ return null;
+
+ if (events.detached)
+ {
+ class RegisterDetached implements BiFunction
+ {
+ TxnEvent event;
+
+ @Override
+ public TxnEvents apply(TxnId txnId, TxnEvents state)
+ {
+ if (state != null) event = state.trace(kind, globalCount);
+ else
+ {
+ TracePatternState owner = events.owner;
+ if (owner != null)
+ {
+ synchronized (owner)
+ {
+ event = new TxnEvent(events, kind);
+ owner.detachedEvent.put(event.idMicros, event);
+ }
+ }
+ }
+ return state;
+ }
+ }
+ RegisterDetached register = new RegisterDetached();
+ txnIdMap.compute(txnId, register);
+ return register.event;
+ }
+ }
class Register implements BiFunction
{
TxnEvent event;
@Override
- public TxnEvents apply(TxnId id, TxnEvents state)
+ public TxnEvents apply(TxnId txnId, TxnEvents state)
{
if (state == null)
return null;
@@ -629,6 +959,65 @@ public TxnEvents apply(TxnId id, TxnEvents state)
return register.event;
}
+ public void report(AccordTraceIn in, int nodeId)
+ {
+ txnIdMap.compute(in.txnId, (ignore, events) -> {
+ if (events != null)
+ {
+ report(in, nodeId, events);
+ return events;
+ }
+ else
+ {
+ for (TracePatternState patternState : allPatterns)
+ {
+ synchronized (patternState)
+ {
+ TxnEvent detached = patternState.detachedEvent.get(in.idMicros);
+ if (detached != null)
+ {
+ report(in, nodeId, detached);
+ return null;
+ }
+ }
+ }
+ return null;
+ }
+ });
+
+ }
+
+ private boolean report(AccordTraceIn in, int nodeId, TxnEvents events)
+ {
+ for (int i = 0 ; i < events.size ; ++i)
+ {
+ TxnEvent event = events.get(i);
+ if (event.idMicros == in.idMicros)
+ {
+ report(in, nodeId, event);
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private void report(AccordTraceIn in, int nodeId, TxnEvent event)
+ {
+ synchronized (event)
+ {
+ long atNanos = Math.max(in.atNanos, event.atNanos);
+ int j = event.messages.size();
+ while (j > 0 && event.messages.get(j - 1).atNanos >= atNanos)
+ --j;
+ while (j < event.messages.size() && event.messages.get(j).atNanos == atNanos)
+ {
+ atNanos++;
+ j++;
+ }
+ event.messages.add(j, Message.withExactNanos(nodeId, in.commandStoreId, in.message, atNanos));
+ }
+ }
+
// null values, or values < 0, are ignored
public void set(TxnId txnId, CoordinationKinds trace, BucketMode newBucketMode, int newBucketSize, int newBucketSubSize, int newBucketSeen, float newChance, boolean unsetManagedByPattern)
{
@@ -641,7 +1030,7 @@ public void set(TxnId txnId, CoordinationKinds trace, BucketMode newBucketMode,
if (newBucketSize < 0)
throw new IllegalArgumentException("Must specify bucket size for new trace config.");
- cur = new TxnEvents();
+ cur = new TxnEvents(this, id);
if (newBucketSubSize < 0)
cur.bucketSubSize = newBucketSize;
}
@@ -714,6 +1103,15 @@ public void eraseAllBuckets()
txnIdMap.keySet().forEach(this::stopTracing);
}
+ public void eraseAll()
+ {
+ allPatterns.forEach(TracePatternState::clear);
+ txnIdMap.keySet().forEach(txnId -> {
+ eraseEvents(txnId);
+ stopTracing(txnId);
+ });
+ }
+
public void forEach(Predicate include, ConsumeState forEach)
{
txnIdMap.forEach((txnId, state) -> {
@@ -721,13 +1119,24 @@ public void forEach(Predicate include, ConsumeState forEach)
{
// ensure lock is held for duration of callback
txnIdMap.compute(txnId, (id, cur) -> {
- forEach.accept(txnId, cur);
+ if (cur != null)
+ forEach.accept(txnId, cur);
return cur;
});
}
});
}
+ public void forEach(TxnId txnId, Consumer forEach)
+ {
+ // ensure lock is held for duration of callback
+ txnIdMap.compute(txnId, (id, cur) -> {
+ if (cur != null)
+ forEach.accept(cur);
+ return cur;
+ });
+ }
+
public void setPattern(int id, Function pattern, BucketMode newBucketMode, int newBucketSeen, int newBucketSize, BucketMode newTraceBucketMode, int newTraceBucketSize, int newTraceBucketSubSize, CoordinationKinds newTraceEvents)
{
synchronized (allPatterns)
@@ -815,6 +1224,7 @@ public void onFailed(Throwable failure, TxnId txnId, Participants> participant
if (cur != tracing)
return cur;
+ // TODO (desired): assign an idMicros that is based on coordinationId
TxnEvent event = tracing.forceTrace(coordination.kind(), globalCount);
if (event == null) // we still honour global limit
return cur;
@@ -833,7 +1243,7 @@ public void onFailed(Throwable failure, TxnId txnId, Participants> participant
}
}
- private TxnEvents maybeTrace(TxnId txnId, @Nullable Participants> participants, CoordinationKind kind, NewOrFailure newOrFailure, List patterns)
+ private TxnEvents maybeTrace(TxnId txnId, @Nullable Routables> participants, CoordinationKind kind, NewOrFailure newOrFailure, List patterns)
{
if (patterns.isEmpty())
return null;
diff --git a/src/java/org/apache/cassandra/service/accord/debug/DebugExecution.java b/src/java/org/apache/cassandra/service/accord/debug/DebugExecution.java
new file mode 100644
index 000000000000..a6aa88cd0e19
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/accord/debug/DebugExecution.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.accord.debug;
+
+import java.lang.management.ManagementFactory;
+import java.lang.management.ThreadMXBean;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import accord.local.Command;
+
+import org.apache.cassandra.config.CassandraRelevantProperties;
+import org.apache.cassandra.metrics.LogLinearHistogram;
+import org.apache.cassandra.service.accord.AccordExecutor;
+
+import static org.apache.cassandra.config.CassandraRelevantProperties.DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED;
+import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+
+public class DebugExecution
+{
+ private static final Logger logger = LoggerFactory.getLogger(DebugExecution.class);
+ public static final boolean DEBUG_EXECUTION = CassandraRelevantProperties.ACCORD_DEBUG_EXECUTION.getBoolean(false);
+ private static final long REPORT_MIN_LATENCY_MICROS = 10_000;
+ private static final long REPORT_CPU_RATIO = 2;
+ private static final long REPORT_MAX_LATENCY_MICROS = 50_000;
+
+ // TODO (expected): use sharded histogram so we can report global stats
+ public static class DebugExecutor
+ {
+ public static DebugExecutor maybeDebug() { return DEBUG_EXECUTION ? new DebugExecutor() : null; }
+
+ private DebugExecutor() {}
+
+ final LogLinearHistogram waitingToLock = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram locked = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram sequentialExecutorWaitingToRunLatency = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram sequentialExecutorSetHeadToRunLatency = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram pollToRun = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram applying = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram running = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram cleanup = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+ final LogLinearHistogram taskTotal = new LogLinearHistogram(REPORT_MAX_LATENCY_MICROS);
+
+ long lockedAt, lockedAtCpu;
+ long unlockedAt, unlockedAtCpu;
+
+ public void onEnterLock()
+ {
+ onEnterLock(0);
+ }
+
+ public void onEnterLock(long lockAt)
+ {
+ lockedAt = nanoTime();
+ lockedAtCpu = nowCpu();
+ if (lockAt > 0)
+ {
+ long waitingToLockForMicros = (lockedAt - lockAt)/1000;
+ waitingToLock.increment(waitingToLockForMicros);
+ if (waitingToLockForMicros > REPORT_MAX_LATENCY_MICROS)
+ {
+ report("Took {}us to acquire executor lock", waitingToLockForMicros);
+ }
+ }
+ }
+
+ public void onExitLock()
+ {
+ unlockedAt = nanoTime();
+ unlockedAtCpu = nowCpu();
+ long lockedForMicros = (unlockedAt - lockedAt)/1000;
+ long lockedForCpuMicros = (unlockedAtCpu - lockedAtCpu)/1000;
+ if (lockedForMicros >= REPORT_MAX_LATENCY_MICROS)
+ {
+ report("Held lock for {}us (cpu:{}us)\n", lockedForMicros, lockedForCpuMicros);
+ }
+ else if (lockedForMicros >= REPORT_MIN_LATENCY_MICROS && lockedForMicros >= lockedForCpuMicros * REPORT_CPU_RATIO)
+ {
+ report("Held lock for {}us with cpu time only {}us\n", lockedForMicros, lockedForCpuMicros);
+ }
+ locked.increment(lockedForMicros);
+ }
+ }
+
+ public static class DebugExecutorLoop
+ {
+ final DebugExecutor owner;
+ long lockAt;
+
+ public DebugExecutorLoop(DebugExecutor owner)
+ {
+ this.owner = owner;
+ }
+
+ public void onLock()
+ {
+ lockAt = nanoTime();
+ }
+
+ public void onEnterLock()
+ {
+ owner.onEnterLock(lockAt);
+ lockAt = 0;
+ }
+
+ public void onExitLock()
+ {
+ owner.onExitLock();
+ }
+ }
+
+ public static class DebugSequentialExecutor
+ {
+ public static DebugSequentialExecutor maybeDebug(DebugExecutor owner, int commandStoreId)
+ {
+ return DEBUG_EXECUTION ? new DebugSequentialExecutor(owner, commandStoreId) : null;
+ }
+
+ final DebugExecutor owner;
+ final int commandStoreId;
+
+ long setTaskAt, waitingAt;
+ AccordExecutor.Task prev;
+
+ public DebugSequentialExecutor(DebugExecutor owner, int commandStoreId)
+ {
+ this.owner = owner;
+ this.commandStoreId = commandStoreId;
+ }
+
+ public void onSetTask(AccordExecutor.Task next)
+ {
+ if (next == null) setTaskAt = 0;
+ else setTaskAt = nanoTime();
+ }
+
+ public void onComplete(AccordExecutor.Task completed)
+ {
+ long readyAt = setTaskAt;
+ if (waitingAt > setTaskAt)
+ {
+ readyAt = waitingAt;
+ long waitingMicros = (completed.runningAt - waitingAt)/1000;
+ owner.sequentialExecutorWaitingToRunLatency.increment(waitingMicros);
+ if (waitingMicros > REPORT_MAX_LATENCY_MICROS)
+ report("{} spent {}us blocked by a direct execution on queue {}", completed, waitingMicros, commandStoreId);
+ }
+ long atHeadMicros = (completed.runningAt - readyAt)/1000;
+ owner.sequentialExecutorSetHeadToRunLatency.increment(atHeadMicros);
+ if (atHeadMicros > REPORT_MAX_LATENCY_MICROS)
+ {
+ report("{} spent {}us at head of queue {}", completed, atHeadMicros, commandStoreId);
+ }
+ this.prev = completed;
+ }
+
+ public void onWaiting()
+ {
+ waitingAt = nanoTime();
+ }
+ }
+
+ public static class DebugTask
+ {
+ public static final boolean SANITY_CHECK = DTEST_ACCORD_JOURNAL_SANITY_CHECK_ENABLED.getBoolean();
+ private static final boolean DEBUG = DEBUG_EXECUTION || SANITY_CHECK;
+ public static DebugTask maybeDebug() { return DEBUG ? new DebugTask() : null; }
+
+ public List sanityCheck; // for AccordTask only
+ long polledAt, appliedAt, completedAt;
+ long polledAtCpu, completedAtCpu;
+
+ public void onPolled()
+ {
+ polledAt = nanoTime();
+ polledAtCpu = ManagementFactory.getThreadMXBean().getCurrentThreadCpuTime();
+ }
+
+ public void onRunComplete()
+ {
+ appliedAt = nanoTime();
+ }
+
+ public void onCompleted(AccordExecutor.Task task, DebugExecutor owner)
+ {
+ completedAt = nanoTime();
+ completedAtCpu = ManagementFactory.getThreadMXBean().getCurrentThreadCpuTime();
+ if (task.runningAt > 0 && polledAt > 0)
+ {
+ long pollToRunMicros = (task.runningAt - polledAt)/1000;
+ owner.pollToRun.increment(pollToRunMicros);
+ long applyingMicros = -1;
+ if (appliedAt > 0)
+ {
+ applyingMicros = (appliedAt - task.runningAt)/1000;
+ owner.applying.increment(applyingMicros);
+ }
+ long runningMicros = (task.cleanupAt - task.runningAt)/1000;
+ owner.running.increment(runningMicros);
+ long cleanupMicros = (completedAt - task.cleanupAt)/1000;
+ owner.cleanup.increment(cleanupMicros);
+ long totalMicros = (completedAt - polledAt)/1000;
+ owner.taskTotal.increment(totalMicros);
+ long totalCpu = (completedAtCpu - polledAtCpu)/1000;
+ if (totalMicros > REPORT_MAX_LATENCY_MICROS || (totalMicros > REPORT_MIN_LATENCY_MICROS && totalMicros >= totalCpu * REPORT_CPU_RATIO))
+ {
+ report("{}: total {}us {}cpu, running {}us{}, cleanup {}us, pollToRun {}us", task, totalMicros, totalCpu,
+ runningMicros, (applyingMicros >= 0 ? ", applying " + applyingMicros + "us" : ""), cleanupMicros, pollToRunMicros);
+ }
+ }
+ }
+ }
+
+ private static void report(String message, Object ... params)
+ {
+ logger.warn(Thread.currentThread() + " " + message, params);
+ }
+
+ private static final ThreadMXBean runtime = ManagementFactory.getThreadMXBean();
+ private static long nowCpu()
+ {
+ return runtime.getCurrentThreadCpuTime();
+ }
+}
diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java
index 688f45192de6..b4fde323655d 100644
--- a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java
+++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropApply.java
@@ -165,7 +165,7 @@ private synchronized void ack()
// and prevents races where we respond before dispatching all the required reads (if the reads are
// completing faster than the reads can be setup on all required shards)
if (-1 == --waitingOnCount)
- node.reply(replyTo, replyContext, ApplyReply.Applied, null);
+ node.reply(replyTo, replyContext, ApplyReply.Applied, null, tracing());
}
@Override
@@ -175,11 +175,11 @@ protected void acceptInternal(ApplyReply reply, Throwable failure)
{
// Respond with insufficient which should make the coordinator send us the commit
// we need to respond
- node.reply(replyTo, replyContext, reply, failure);
+ node.reply(replyTo, replyContext, reply, failure, tracing());
}
else if (failure != null)
{
- node.reply(replyTo, replyContext, null, failure);
+ node.reply(replyTo, replyContext, null, failure, tracing());
node.agent().onException(failure);
fail();
}
diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java
index 22a0a7932e18..49b467cc7f0a 100644
--- a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java
+++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropExecution.java
@@ -219,7 +219,7 @@ public void sendReadCommand(Message message, InetAddressAndPort to,
// Also this read scope doesn't reflect the contents of this particular read and is larger than it needs to be
// TODO (required): understand interop and whether StableFastPath is appropriate
AccordInteropStableThenRead commit = new AccordInteropStableThenRead(id, allTopologies, txnId, Kind.StableFastPath, executeAt, txn, deps, route, message.payload);
- node.send(id, commit, executor, new AccordInteropRead.ReadCallback(id, to, message, callback, this));
+ node.send(id, commit, executor, new AccordInteropRead.ReadCallback(id, to, message, callback, this), null);
}
@Override
@@ -235,7 +235,7 @@ public void sendReadRepairMutation(Message message, InetAddressAndPort
}
Participants> readScope = Participants.singleton(txn.read().keys().domain(), new TokenKey(message.payload.getTableIds().iterator().next(), message.payload.key().getToken()));
AccordInteropReadRepair readRepair = new AccordInteropReadRepair(id, executes, txnId, readScope, executeAt.epoch(), message.payload);
- node.send(id, readRepair, executor, new AccordInteropReadRepair.ReadRepairCallback(id, to, message, callback, this));
+ node.send(id, readRepair, executor, new AccordInteropReadRepair.ReadRepairCallback(id, to, message, callback, this), null);
}
private List> readChains(Dispatcher.RequestTime requestTime)
@@ -376,7 +376,7 @@ private void sendStableToUncontacted()
{
InetAddressAndPort endpoint = endpointMapper.mappedEndpointOrNull(to);
if (endpoint != null && !contacted.contains(endpoint))
- node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps));
+ node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps), null);
}
}
@@ -387,7 +387,7 @@ public void start()
for (Node.Id to : allTopologies.nodes())
{
if (!executeTopology.contains(to))
- node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps));
+ node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps), null);
}
}
AsyncChain result;
@@ -423,7 +423,7 @@ private AsyncChain executeUnrecoverableRepairUpdate()
// and can be extended similar to MessageType which allows additional types not from Accord to be added
// This commit won't necessarily execute before the interop read repair message so there could be an insufficient which is fine
for (Node.Id to : executeTopology.nodes())
- node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps));
+ node.send(to, new Commit(Kind.StableFastPath, to, allTopologies, txnId, txn, route, Ballot.ZERO, executeAt, deps), null);
repairUpdate.runBRR(AccordInteropExecution.this);
return new TxnData();
});
@@ -455,7 +455,7 @@ public ReadCommand maybeAllowOutOfRangeReads(ReadCommand readCommand, Consistenc
// Provide request callbacks with a way to send maximal commits on Insufficient responses
public void sendMaximalCommit(Id to)
{
- node.send(to, new Commit(Kind.StableWithTxnAndDeps, to, allTopologies, txnId, txn, route, ballot, executeAt, deps));
+ node.send(to, new Commit(Kind.StableWithTxnAndDeps, to, allTopologies, txnId, txn, route, ballot, executeAt, deps), null);
}
public void maybeUpdateUniqueHlc(long uniqueHlc)
diff --git a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java
index 504f1187a1ca..101424bd13b9 100644
--- a/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java
+++ b/src/java/org/apache/cassandra/service/accord/interop/AccordInteropRead.java
@@ -245,6 +245,8 @@ public ReadCallback(Node.Id id, InetAddressAndPort endpoint, Message> message,
@Override
ReadResponse convertResponse(ReadOk ok)
{
+ LocalReadData data = (LocalReadData) ok.data;
+ data.ensureRemoteResponse();
+ return data.remoteResponse;
}
}
diff --git a/src/java/org/apache/cassandra/service/accord/journal/AccordJournal.java b/src/java/org/apache/cassandra/service/accord/journal/AccordJournal.java
index f10317f35235..425aa0f4f202 100644
--- a/src/java/org/apache/cassandra/service/accord/journal/AccordJournal.java
+++ b/src/java/org/apache/cassandra/service/accord/journal/AccordJournal.java
@@ -41,6 +41,8 @@
import accord.local.CommandStores;
import accord.local.CommandStores.RangesForEpoch;
import accord.local.DurableBefore;
+import accord.local.MinimalCommand;
+import accord.local.MinimalCommand.MinimalWithDeps;
import accord.local.Node;
import accord.local.RedundantBefore;
import accord.primitives.EpochSupplier;
@@ -50,8 +52,8 @@
import accord.utils.Invariants;
import accord.utils.PersistentField;
-import org.apache.cassandra.config.AccordSpec.JournalSpec;
-import org.apache.cassandra.config.AccordSpec.JournalSpec.ReplayMode;
+import org.apache.cassandra.config.AccordConfig.JournalConfig;
+import org.apache.cassandra.config.AccordConfig.JournalConfig.ReplayMode;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.marshal.LongType;
@@ -77,7 +79,7 @@
import static accord.api.Journal.Load.MINIMAL;
import static accord.api.Journal.Load.MINIMAL_WITH_DEPS;
import static accord.local.Cleanup.Input.FULL;
-import static org.apache.cassandra.config.AccordSpec.RangeIndexMode.journal_sai;
+import static org.apache.cassandra.config.AccordConfig.RangeIndexMode.journal_sai;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccord;
import static org.apache.cassandra.config.DatabaseDescriptor.getAccordJournalDirectory;
import static org.apache.cassandra.service.accord.JournalKey.Type.COMMAND_DIFF;
@@ -230,14 +232,14 @@ public List debugCommand(int commandStoreId, TxnId txnId)
}
@Override
- public Command.Minimal loadMinimal(int commandStoreId, TxnId txnId, RedundantBefore redundantBefore, DurableBefore durableBefore)
+ public MinimalCommand loadMinimal(int commandStoreId, TxnId txnId, RedundantBefore redundantBefore, DurableBefore durableBefore)
{
CommandChanges builder = CommandChanges.cleanupAndFilter(loadDiffs(commandStoreId, txnId, MINIMAL), redundantBefore, durableBefore);
return builder == null ? null : builder.asMinimal();
}
@Override
- public Command.MinimalWithDeps loadMinimalWithDeps(int commandStoreId, TxnId txnId, RedundantBefore redundantBefore, DurableBefore durableBefore)
+ public MinimalWithDeps loadMinimalWithDeps(int commandStoreId, TxnId txnId, RedundantBefore redundantBefore, DurableBefore durableBefore)
{
CommandChanges builder = CommandChanges.cleanupAndFilter(loadDiffs(commandStoreId, txnId, MINIMAL_WITH_DEPS), redundantBefore, durableBefore);
return builder == null ? null : builder.asMinimalWithDeps();
@@ -596,8 +598,8 @@ public void replay(CommandStore commandStore, ReplayMode replayMode, long minSeg
@Override
public boolean replay(CommandStores commandStores, Object param)
{
- ReplayMode mode = params instanceof JournalSpec ? ((JournalSpec)params).replay
- : getAccord().journal.replay;
+ ReplayMode mode = params instanceof JournalConfig ? ((JournalConfig)params).replay
+ : getAccord().journal.replay;
return Replay.replay(this, mode, commandStores.all(), param);
}
diff --git a/src/java/org/apache/cassandra/service/accord/journal/Replay.java b/src/java/org/apache/cassandra/service/accord/journal/Replay.java
index ce840ccc275e..5b5ac1b6c3e6 100644
--- a/src/java/org/apache/cassandra/service/accord/journal/Replay.java
+++ b/src/java/org/apache/cassandra/service/accord/journal/Replay.java
@@ -36,7 +36,7 @@
import accord.utils.Invariants;
import accord.utils.UnhandledEnum;
-import org.apache.cassandra.config.AccordSpec.JournalSpec.ReplayMode;
+import org.apache.cassandra.config.AccordConfig.JournalConfig.ReplayMode;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.journal.Journal;
import org.apache.cassandra.service.accord.AccordCommandStore;
diff --git a/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java
index ddcaec9071b4..0b3a75f90cb6 100644
--- a/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java
+++ b/src/java/org/apache/cassandra/service/accord/serializers/CommandStoreSerializers.java
@@ -39,7 +39,6 @@
import accord.local.MaxConflicts;
import accord.local.MaxDecidedRX;
import accord.local.RedundantBefore;
-import accord.local.RejectBefore;
import accord.primitives.Range;
import accord.primitives.Ranges;
import accord.primitives.SaveStatus;
@@ -77,7 +76,7 @@ public class CommandStoreSerializers
public static final UnversionedSerializer maxDecidedRX = new ReducingRangeMapSerializer<>(new DecidedRXSerializer(), MaxDecidedRX.DecidedRX[]::new, MaxDecidedRX.SerializerSupport::create, MaxDecidedRX.EMPTY);
public static final UnversionedSerializer redundantBeforeShortBounds = new RedundantBeforeShortBoundsSerializer();
public static final UnversionedSerializer redundantBefore = new ReducingRangeMapSerializer<>(redundantBeforeShortBounds, RedundantBefore.Bounds[]::new, RedundantBefore.SerializerSupport::create, RedundantBefore.EMPTY);
- public static final UnversionedSerializer rejectBefore = new ReducingRangeMapSerializer<>(CommandSerializers.timestamp, Timestamp[]::new, RejectBefore.SerializerSupport::create, RejectBefore.EMPTY);
+ public static final UnversionedSerializer rejectBefore = new RejectBeforeSerializer();
public static final UnversionedSerializer> bootstrapBeganAt = new TimestampToRangesMapSerializer<>(CommandSerializers.txnId);
public static final UnversionedSerializer> safeToRead = new TimestampToRangesMapSerializer<>(CommandSerializers.timestamp);
public static final UnversionedSerializer txnListener = new TxnListenerSerializer();
@@ -578,14 +577,15 @@ public long serializedSize(Map map)
private static final class MaxConflictsSerializer extends BTreeReducingRangeMapSerializer
{
// use top bits of a single byte vint, to leave room for base impl to fill other way
- private static final int SEPARATE_WRITES = 0x40;
+ private static final int INCLUDES_WRITE = 0x40;
+ private static final int INCLUDES_REJECT = 0x20;
private MaxConflictsSerializer() {}
@Override
protected int mapFlags()
{
- return SEPARATE_WRITES;
+ return INCLUDES_WRITE | INCLUDES_REJECT;
}
@Override
@@ -605,30 +605,78 @@ void serializeWithoutRange(MaxConflicts.Entry entry, DataOutputPlus out) throws
{
CommandSerializers.timestamp.serialize(entry.any, out);
CommandSerializers.timestamp.serialize(entry.write, out);
+ CommandSerializers.timestamp.serialize(entry.reject, out);
}
@Override
long serializedSizeWithoutRange(MaxConflicts.Entry entry)
{
return CommandSerializers.timestamp.serializedSize(entry.any)
- + CommandSerializers.timestamp.serializedSize(entry.write);
+ + CommandSerializers.timestamp.serializedSize(entry.write)
+ + CommandSerializers.timestamp.serializedSize(entry.reject);
}
@Override
MaxConflicts.Entry deserialize(RoutingKey start, RoutingKey end, DataInputPlus in, int mapFlags) throws IOException
{
- Timestamp all = CommandSerializers.timestamp.deserialize(in);
- Timestamp writes = all;
- if ((mapFlags & SEPARATE_WRITES) != 0)
- writes = CommandSerializers.timestamp.deserialize(in);
- return new MaxConflicts.Entry(start, end, all, writes);
+ Timestamp any = CommandSerializers.timestamp.deserialize(in);
+ Timestamp write = any;
+ if ((mapFlags & INCLUDES_WRITE) != 0)
+ write = CommandSerializers.timestamp.deserialize(in);
+ Timestamp reject = Timestamp.NONE;
+ if ((mapFlags & INCLUDES_REJECT) != 0)
+ reject = CommandSerializers.timestamp.deserialize(in);
+ return MaxConflicts.Entry.create(start, end, any, write, reject);
}
@Override
MaxConflicts.Entry deserializeArrayModeWithoutRange(DataInputPlus in) throws IOException
{
Timestamp all = CommandSerializers.timestamp.deserialize(in);
- return new MaxConflicts.Entry(all, all);
+ return MaxConflicts.Entry.create(all, all, Timestamp.NONE);
+ }
+ }
+
+ // legacy version that deserializes RejectsBefore into MaxConflicts
+    private static final class RejectBeforeSerializer extends BTreeReducingRangeMapSerializer<MaxConflicts.Entry, MaxConflicts>
+ {
+ private RejectBeforeSerializer() {}
+
+ @Override
+ MaxConflicts empty()
+ {
+ return MaxConflicts.EMPTY;
+ }
+
+ @Override
+ BTreeReducingRangeMap.Builder builder()
+ {
+ return new MaxConflicts.Builder();
+ }
+
+ @Override
+ void serializeWithoutRange(MaxConflicts.Entry entry, DataOutputPlus out)
+ {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ long serializedSizeWithoutRange(MaxConflicts.Entry entry)
+ {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ MaxConflicts.Entry deserialize(RoutingKey start, RoutingKey end, DataInputPlus in, int mapFlags) throws IOException
+ {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ MaxConflicts.Entry deserializeArrayModeWithoutRange(DataInputPlus in) throws IOException
+ {
+ Timestamp reject = CommandSerializers.timestamp.deserialize(in);
+ return MaxConflicts.Entry.create(Timestamp.NONE, Timestamp.NONE, reject);
}
}
diff --git a/src/java/org/apache/cassandra/service/accord/serializers/RemoteSuccessSerializers.java b/src/java/org/apache/cassandra/service/accord/serializers/RemoteSuccessSerializers.java
new file mode 100644
index 000000000000..8a66118023e1
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/accord/serializers/RemoteSuccessSerializers.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service.accord.serializers;
+
+import java.io.IOException;
+
+import accord.messages.RemoteSuccess;
+import accord.primitives.TxnId;
+
+import org.apache.cassandra.io.VersionedSerializer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.service.accord.txn.TxnData;
+
+public class RemoteSuccessSerializers
+{
+    public static final VersionedSerializer<RemoteSuccess> remoteSuccess = new VersionedSerializer<>()
+ {
+ @Override
+ public void serialize(RemoteSuccess msg, DataOutputPlus out, Version version) throws IOException
+ {
+ CommandSerializers.txnId.serialize(msg.txnId, out);
+ TxnData.serializer.serialize((TxnData) msg.result, out, version);
+ }
+
+ @Override
+ public RemoteSuccess deserialize(DataInputPlus in, Version version) throws IOException
+ {
+ TxnId txnId = CommandSerializers.txnId.deserialize(in);
+ TxnData data = TxnData.serializer.deserialize(in, version);
+ return new RemoteSuccess(txnId, data);
+ }
+
+ @Override
+ public long serializedSize(RemoteSuccess msg, Version version)
+ {
+ return CommandSerializers.txnId.serializedSize(msg.txnId)
+ + TxnData.serializer.serializedSize((TxnData) msg.result, version);
+ }
+ };
+}
diff --git a/src/java/org/apache/cassandra/service/accord/topology/AccordTopologyService.java b/src/java/org/apache/cassandra/service/accord/topology/AccordTopologyService.java
index d63fab19ae91..9851ab6a5674 100644
--- a/src/java/org/apache/cassandra/service/accord/topology/AccordTopologyService.java
+++ b/src/java/org/apache/cassandra/service/accord/topology/AccordTopologyService.java
@@ -20,6 +20,7 @@
import java.util.HashSet;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import java.util.function.BiConsumer;
import javax.annotation.concurrent.GuardedBy;
@@ -42,6 +43,8 @@
import org.apache.cassandra.concurrent.ScheduledExecutorPlus;
import org.apache.cassandra.concurrent.ScheduledExecutors;
import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.config.AccordConfig;
+import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.net.MessageDelivery;
import org.apache.cassandra.net.MessagingService;
@@ -94,10 +97,40 @@ public void onStartup(Node node)
previouslyRemovedIds = removed;
}
node.topology().addListener(watermarkCollector);
- node.topology().addListener(syncPropagator);
- syncPropagator.onNodesRemoved(removed);
+ if (DatabaseDescriptor.getAccord().topology_sync_propagator_enabled_pre_start)
+ {
+ node.topology().addListener(syncPropagator);
+ syncPropagator.onNodesRemoved(removed);
+ }
+ }
+
+ public void afterStartup(Node node)
+ {
+ AccordConfig config = DatabaseDescriptor.getAccord();
+ if (config.topology_sync_propagator_enabled_post_startup && !config.topology_sync_propagator_enabled_pre_start)
+ {
+ node.topology().addListener(syncPropagator);
+ syncPropagator.onNodesRemoved(previouslyRemovedIds);
+ }
+
+ long watermarkIntervalNanos = config.topology_watermark_interval.toNanoseconds();
+ if (watermarkIntervalNanos > 0)
+ fetchAndReportWatermarksRecurring(node, watermarkIntervalNanos);
+ }
+
+ public void fetchAndReportWatermarksRecurring(Node node, long intervalNanos)
+ {
+ if (state == State.SHUTDOWN)
+ return;
+
+ ScheduledExecutors.scheduledFastTasks.scheduleSelfRecurring(() -> {
+ WatermarkCollector.fetchAndReportWatermarksAsync(node.topology()).addCallback((success, fail) -> {
+ fetchAndReportWatermarksRecurring(node, intervalNanos);
+ });
+ }, intervalNanos, TimeUnit.NANOSECONDS);
}
+
public synchronized void shutdown()
{
state = State.SHUTDOWN;
diff --git a/src/java/org/apache/cassandra/service/accord/topology/FastPathStrategy.java b/src/java/org/apache/cassandra/service/accord/topology/FastPathStrategy.java
index 4757785a6472..cdde1226412c 100644
--- a/src/java/org/apache/cassandra/service/accord/topology/FastPathStrategy.java
+++ b/src/java/org/apache/cassandra/service/accord/topology/FastPathStrategy.java
@@ -44,7 +44,7 @@ public interface FastPathStrategy
{
enum Kind
{
- SIMPLE, PARAMETERIZED, INHERIT_KEYSPACE;
+ SIMPLE, PARAMETERIZED, INHERIT_KEYSPACE, UP;
static final String KEY = "kind";
private static final Map LOOKUP;
@@ -54,6 +54,7 @@ enum Kind
builder.put(SIMPLE.name(), SIMPLE);
builder.put(PARAMETERIZED.name(), PARAMETERIZED);
builder.put(INHERIT_KEYSPACE.name(), INHERIT_KEYSPACE);
+ builder.put(UP.name(), UP);
LOOKUP = builder.build();
}
@@ -115,6 +116,8 @@ static FastPathStrategy fromMap(Map