From e47fb3b5788b9bb1e497f1f634cfa05388d57067 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 22:40:59 -0500 Subject: [PATCH 1/9] feat: add ExtractLeafExpressions optimizer rule for get_field pushdown --- .../optimizer/src/extract_leaf_expressions.rs | 1464 +++++++++++++++-- datafusion/optimizer/src/optimizer.rs | 3 + .../sqllogictest/test_files/explain.slt | 8 + .../sqllogictest/test_files/projection.slt | 2 +- .../test_files/projection_pushdown.slt | 468 +++--- .../test_files/push_down_filter.slt | 9 +- datafusion/sqllogictest/test_files/struct.slt | 4 +- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 8 files changed, 1637 insertions(+), 323 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 151bca8278836..eede292c1a367 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -15,23 +15,28 @@ // specific language governing permissions and limitations // under the License. -//! NB: This module is a work in progress. -//! We merged it early in -//! with the skeleton and snapshots matching the current state, -//! but the actual implementation is pending further development. -//! There may be comments or code that are incomplete or inaccurate. //! Two-pass optimizer pipeline that pushes cheap expressions (like struct field //! access `user['status']`) closer to data sources, enabling early data reduction //! and source-level optimizations (e.g., Parquet column pruning). See //! [`ExtractLeafExpressions`] (pass 1) and [`PushDownLeafProjections`] (pass 2). 
-use datafusion_common::Result; -use datafusion_common::tree_node::Transformed; +use indexmap::{IndexMap, IndexSet}; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion_common::alias::AliasGenerator; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion_common::{Column, DFSchema, Result, qualified_name}; use datafusion_expr::logical_plan::LogicalPlan; +use datafusion_expr::{Expr, ExpressionPlacement, Projection}; use crate::optimizer::ApplyOrder; +use crate::push_down_filter::replace_cols_by_name; +use crate::utils::has_all_column_refs; use crate::{OptimizerConfig, OptimizerRule}; +const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; + /// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes /// into **extraction projections** (pass 1 of 2). /// @@ -72,7 +77,8 @@ use crate::{OptimizerConfig, OptimizerRule}; /// ``` /// /// **Important:** The `PushDownFilter` rule is aware of projections created by this rule -/// and will not push filters through them. See `is_extracted_expr_projection` in utils.rs. +/// and will not push filters through them. It uses `ExpressionPlacement` to detect +/// `MoveTowardsLeafNodes` expressions and skip filter pushdown past them. #[derive(Default, Debug)] pub struct ExtractLeafExpressions {} @@ -95,9 +101,443 @@ impl OptimizerRule for ExtractLeafExpressions { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + extract_from_plan(plan, alias_generator) + } +} + +/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. +/// +/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes +/// like Join, each extracted sub-expression is routed to the correct input +/// by checking which input's schema contains all of the expression's column +/// references. 
+fn extract_from_plan( + plan: LogicalPlan, + alias_generator: &Arc, +) -> Result> { + // Only extract from plan types whose output schema is predictable after + // expression rewriting. Nodes like Window derive column names from + // their expressions, so rewriting `get_field` inside a window function + // changes the output schema and breaks the recovery projection. + if !matches!( + &plan, + LogicalPlan::Aggregate(_) + | LogicalPlan::Filter(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Limit(_) + | LogicalPlan::Join(_) + ) { + return Ok(Transformed::no(plan)); + } + + let inputs = plan.inputs(); + if inputs.is_empty() { + return Ok(Transformed::no(plan)); + } + + // Save original output schema before any transformation + let original_schema = Arc::clone(plan.schema()); + + // Clone inputs upfront (before plan is consumed by map_expressions) + let owned_inputs: Vec = inputs.into_iter().cloned().collect(); + + // Build per-input schemas (kept alive for extractor borrows) + let input_schemas: Vec> = owned_inputs + .iter() + .map(|i| Arc::clone(i.schema())) + .collect(); + + // Build per-input extractors + let mut extractors: Vec = input_schemas + .iter() + .map(|schema| LeafExpressionExtractor::new(schema.as_ref(), alias_generator)) + .collect(); + + // Build per-input column sets for routing expressions to the correct input + let input_column_sets: Vec> = input_schemas + .iter() + .map(|schema| schema_columns(schema.as_ref())) + .collect(); + + // Transform expressions via map_expressions with routing + let transformed = plan.map_expressions(|expr| { + routing_extract(expr, &mut extractors, &input_column_sets) + })?; + + // If no expressions were rewritten, nothing was extracted + if !transformed.transformed { + return Ok(transformed); + } + + // Build per-input extraction projections (None means no extractions for that input) + let new_inputs: Vec = owned_inputs + .iter() + .zip(extractors.iter()) + .map(|(input, extractor)| { + let input_arc = 
Arc::new(input.clone()); + Ok(extractor + .build_extraction_projection(&input_arc)? + .unwrap_or_else(|| input.clone())) + }) + .collect::>>()?; + + // Rebuild the node on top of the new inputs + let new_plan = transformed + .data + .with_new_exprs(transformed.data.expressions(), new_inputs)?; + + // Add recovery projection if the output schema changed + let recovered = build_recovery_projection(original_schema.as_ref(), new_plan)?; + + Ok(Transformed::yes(recovered)) +} + +/// Given an expression, returns the index of the input whose columns fully +/// cover the expression's column references. +/// Returns `None` if the expression references columns from multiple inputs. +fn find_owning_input( + expr: &Expr, + input_column_sets: &[std::collections::HashSet], +) -> Option { + input_column_sets + .iter() + .position(|cols| has_all_column_refs(expr, cols)) +} + +/// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes` +/// sub-expressions and routing each to the correct per-input extractor. 
+fn routing_extract( + expr: Expr, + extractors: &mut [LeafExpressionExtractor], + input_column_sets: &[std::collections::HashSet], +) -> Result> { + expr.transform_down(|e| { + // Skip expressions already aliased with extracted expression pattern + if let Expr::Alias(alias) = &e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Ok(Transformed { + data: e, + transformed: false, + tnr: TreeNodeRecursion::Jump, + }); + } + + // Don't extract Alias nodes directly — preserve the alias and let + // transform_down recurse into the inner expression + if matches!(&e, Expr::Alias(_)) { + return Ok(Transformed::no(e)); + } + + match e.placement() { + ExpressionPlacement::MoveTowardsLeafNodes => { + if let Some(idx) = find_owning_input(&e, input_column_sets) { + let col_ref = extractors[idx].add_extracted(e)?; + Ok(Transformed::yes(col_ref)) + } else { + // References columns from multiple inputs — cannot extract + Ok(Transformed::no(e)) + } + } + ExpressionPlacement::Column => { + // Track columns that the parent node references so the + // extraction projection includes them as pass-through. + // Without this, the extraction projection would only + // contain __extracted_N aliases, and the parent couldn't + // resolve its other column references. 
+ if let Expr::Column(col) = &e + && let Some(idx) = find_owning_input(&e, input_column_sets) + { + extractors[idx].columns_needed.insert(col.clone()); + } + Ok(Transformed::no(e)) + } + _ => Ok(Transformed::no(e)), + } + }) +} + +/// Returns all columns in the schema (both qualified and unqualified forms) +fn schema_columns(schema: &DFSchema) -> std::collections::HashSet { + schema + .iter() + .flat_map(|(qualifier, field)| { + [ + Column::new(qualifier.cloned(), field.name()), + Column::new_unqualified(field.name()), + ] + }) + .collect() +} + +// ============================================================================= +// Helper Functions for Extraction Targeting +// ============================================================================= + +/// Build a replacement map from a projection: output_column_name -> underlying_expr. +/// +/// This is used to resolve column references through a renaming projection. +/// For example, if a projection has `user AS x`, this maps `x` -> `col("user")`. +fn build_projection_replace_map(projection: &Projection) -> HashMap { + projection + .schema + .iter() + .zip(projection.expr.iter()) + .map(|((qualifier, field), expr)| { + let key = Column::from((qualifier, field)).flat_name(); + (key, expr.clone().unalias()) + }) + .collect() +} + +/// Build a recovery projection to restore the original output schema. +/// +/// After extraction, a node's output schema may differ from the original: +/// +/// - **Schema-preserving nodes** (Filter/Sort/Limit): the extraction projection +/// below adds extra `__extracted_N` columns that bubble up through the node. +/// Recovery selects only the original columns to hide the extras. +/// ```text +/// Original schema: [id, user] +/// After extraction: [__extracted_1, id, user] ← extra column leaked through +/// Recovery: SELECT id, user FROM ... 
← hides __extracted_1 +/// ``` +/// +/// - **Schema-defining nodes** (Aggregate): same number of columns but names +/// may differ because extracted aliases replaced the original expressions. +/// Recovery maps positionally, aliasing where names changed. +/// ```text +/// Original: [SUM(user['balance'])] +/// After: [SUM(__extracted_1)] ← name changed +/// Recovery: SUM(__extracted_1) AS "SUM(user['balance'])" +/// ``` +/// +/// - **Schemas identical** → no recovery projection needed. +fn build_recovery_projection( + original_schema: &DFSchema, + input: LogicalPlan, +) -> Result { + let new_schema = input.schema(); + let orig_len = original_schema.fields().len(); + let new_len = new_schema.fields().len(); + + if orig_len == new_len { + // Same number of fields — check if schemas are identical + let schemas_match = original_schema.iter().zip(new_schema.iter()).all( + |((orig_q, orig_f), (new_q, new_f))| { + orig_f.name() == new_f.name() && orig_q == new_q + }, + ); + if schemas_match { + return Ok(input); + } + + // Schema-defining nodes (Projection, Aggregate): names may differ at some positions. + // Map positionally, aliasing where the name changed. + let mut proj_exprs = Vec::with_capacity(orig_len); + for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() { + let (new_qualifier, new_field) = new_schema.qualified_field(i); + if orig_field.name() == new_field.name() && orig_qualifier == new_qualifier { + proj_exprs.push(Expr::from((orig_qualifier, orig_field))); + } else { + let new_col = Expr::Column(Column::from((new_qualifier, new_field))); + proj_exprs.push( + new_col.alias_qualified(orig_qualifier.cloned(), orig_field.name()), + ); + } + } + let projection = Projection::try_new(proj_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } else { + // Schema-preserving nodes: new schema has extra extraction columns. + // Original columns still exist by name; select them to hide extras. 
+ let col_exprs: Vec = original_schema.iter().map(Expr::from).collect(); + let projection = Projection::try_new(col_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } +} + +/// Collects `MoveTowardsLeafNodes` sub-expressions found during expression +/// tree traversal and can build an extraction projection from them. +/// +/// # Example +/// +/// Given `Filter: user['status'] = 'active' AND user['name'] IS NOT NULL`: +/// - `add_extracted(user['status'])` → stores it, returns `col("__extracted_1")` +/// - `add_extracted(user['name'])` → stores it, returns `col("__extracted_2")` +/// - `build_extraction_projection()` produces: +/// `Projection: user['status'] AS __extracted_1, user['name'] AS __extracted_2, [all input columns]` +struct LeafExpressionExtractor<'a> { + /// Extracted expressions: maps expression -> alias + extracted: IndexMap, + /// Columns referenced by extracted expressions or the parent node, + /// included as pass-through in the extraction projection. + columns_needed: IndexSet, + /// Input schema + input_schema: &'a DFSchema, + /// Alias generator + alias_generator: &'a Arc, +} + +impl<'a> LeafExpressionExtractor<'a> { + fn new(input_schema: &'a DFSchema, alias_generator: &'a Arc) -> Self { + Self { + extracted: IndexMap::new(), + columns_needed: IndexSet::new(), + input_schema, + alias_generator, + } + } + + /// Adds an expression to extracted set, returns column reference. 
+ fn add_extracted(&mut self, expr: Expr) -> Result { + // Deduplication: reuse existing alias if same expression + if let Some(alias) = self.extracted.get(&expr) { + return Ok(Expr::Column(Column::new_unqualified(alias))); + } + + // Track columns referenced by this expression + for col in expr.column_refs() { + self.columns_needed.insert(col.clone()); + } + + // Generate unique alias + let alias = self.alias_generator.next(EXTRACTED_EXPR_PREFIX); + self.extracted.insert(expr, alias.clone()); + + Ok(Expr::Column(Column::new_unqualified(&alias))) + } + + /// Builds a fresh extraction projection above the given input. + /// + /// Returns `None` if there are no extractions. Otherwise creates a new + /// projection that includes extracted expressions (aliased) plus all + /// input schema columns for pass-through. + fn build_extraction_projection( + &self, + input: &Arc, + ) -> Result> { + if self.extracted.is_empty() { + return Ok(None); + } + let mut proj_exprs = Vec::new(); + for (expr, alias) in self.extracted.iter() { + proj_exprs.push(expr.clone().alias(alias)); + } + for (qualifier, field) in self.input_schema.iter() { + proj_exprs.push(Expr::from((qualifier, field))); + } + Ok(Some(LogicalPlan::Projection(Projection::try_new( + proj_exprs, + Arc::clone(input), + )?))) + } +} + +/// Build an extraction projection above the target node. +/// +/// If the target is an existing projection, merges into it. This requires +/// resolving column references through the projection's rename mapping: +/// if the projection has `user AS u`, and an extracted expression references +/// `u['name']`, we must rewrite it to `user['name']` since the merged +/// projection reads from the same input as the original. +/// +/// Deduplicates by resolved expression equality and adds pass-through +/// columns as needed. Otherwise builds a fresh projection with extracted +/// expressions + ALL input schema columns. 
+fn build_extraction_projection_impl( + extracted_exprs: &[(Expr, String)], + columns_needed: &IndexSet, + target: &Arc, + target_schema: &DFSchema, +) -> Result { + if let LogicalPlan::Projection(existing) = target.as_ref() { + // Merge into existing projection + let mut proj_exprs = existing.expr.clone(); + + // Build a map of existing expressions (by Expr equality) to their aliases + let existing_extractions: IndexMap = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Alias(alias) = e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Some((*alias.expr.clone(), alias.name.clone())); + } + None + }) + .collect(); + + // Resolve column references through the projection's rename mapping + let replace_map = build_projection_replace_map(existing); + + // Add new extracted expressions, resolving column refs through the projection + for (expr, alias) in extracted_exprs { + let resolved = replace_cols_by_name(expr.clone().alias(alias), &replace_map)?; + let resolved_inner = if let Expr::Alias(a) = &resolved { + a.expr.as_ref() + } else { + &resolved + }; + if let Some(existing_alias) = existing_extractions.get(resolved_inner) { + // Same expression already extracted under a different alias — + // add the expression with the new alias so both names are + // available in the output. We can't reference the existing alias + // as a column within the same projection, so we duplicate the + // computation. + if existing_alias != alias { + proj_exprs.push(resolved); + } + } else { + proj_exprs.push(resolved); + } + } + + // Add any new pass-through columns that aren't already in the projection. + // We check against existing.input.schema() (the projection's source) rather + // than target_schema (the projection's output) because columns produced + // by alias expressions (e.g., CSE's __common_expr_N) exist in the output but + // not the input, and cannot be added as pass-through Column references. 
+ let existing_cols: IndexSet = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Column(c) = e { + Some(c.clone()) + } else { + None + } + }) + .collect(); + + let input_schema = existing.input.schema(); + for col in columns_needed { + let col_expr = Expr::Column(col.clone()); + let resolved = replace_cols_by_name(col_expr, &replace_map)?; + if let Expr::Column(resolved_col) = &resolved + && !existing_cols.contains(resolved_col) + && input_schema.has_column(resolved_col) + { + proj_exprs.push(Expr::Column(resolved_col.clone())); + } + // If resolved to non-column expr, it's already computed by existing projection + } + + Projection::try_new(proj_exprs, Arc::clone(&existing.input)) + } else { + // Build new projection with extracted expressions + all input columns + let mut proj_exprs = Vec::new(); + for (expr, alias) in extracted_exprs { + proj_exprs.push(expr.clone().alias(alias)); + } + for (qualifier, field) in target_schema.iter() { + proj_exprs.push(Expr::from((qualifier, field))); + } + Projection::try_new(proj_exprs, Arc::clone(target)) } } @@ -155,10 +595,507 @@ impl OptimizerRule for PushDownLeafProjections { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + match try_push_input(&plan, alias_generator)? { + Some(new_plan) => Ok(Transformed::yes(new_plan)), + None => Ok(Transformed::no(plan)), + } + } +} + +/// Attempts to push a projection's extractable expressions further down. +/// +/// Returns `Some(new_subtree)` if the projection was pushed down or merged, +/// `None` if there is nothing to push or the projection sits above a barrier. 
+fn try_push_input( + input: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let LogicalPlan::Projection(proj) = input else { + return Ok(None); + }; + split_and_push_projection(proj, alias_generator) +} + +/// Splits a projection into extractable pieces, pushes them towards leaf +/// nodes, and adds a recovery projection if needed. +/// +/// Handles both: +/// - **Pure extraction projections** (all `__extracted` aliases + columns) +/// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions) +/// +/// Returns `Some(new_subtree)` if extractions were pushed down, +/// `None` if there is nothing to extract or push. +/// +/// # Example: Mixed Projection +/// +/// ```text +/// Input plan: +/// Projection: user['name'] IS NOT NULL AS has_name, id +/// Filter: ... +/// TableScan +/// +/// Phase 1 (Split): +/// extraction_pairs: [(user['name'], "__extracted_1")] +/// recovery_exprs: [__extracted_1 IS NOT NULL AS has_name, id] +/// +/// Phase 2 (Push): +/// Push extraction projection through Filter toward TableScan +/// +/// Phase 3 (Recovery): +/// Projection: __extracted_1 IS NOT NULL AS has_name, id <-- recovery +/// Filter: ... +/// Projection: user['name'] AS __extracted_1, id <-- extraction (pushed) +/// TableScan +/// ``` +fn split_and_push_projection( + proj: &Projection, + alias_generator: &Arc, +) -> Result> { + let input = &proj.input; + let input_schema = input.schema(); + + // ── Phase 1: Split ────────────────────────────────────────────────── + // For each projection expression, collect extraction pairs and build + // recovery expressions. + // + // Pre-existing `__extracted` aliases are inserted into the extractor's + // `IndexMap` with the **full** `Expr::Alias(…)` as the key, so the + // alias name participates in equality. This prevents collisions when + // CSE rewrites produce the same inner expression under different alias + // names (e.g. `__common_expr_4 AS __extracted_1` and + // `__common_expr_4 AS __extracted_3`). 
New extractions from + // `routing_extract` use bare (non-Alias) keys and get normal dedup. + // + // When building the final `extraction_pairs`, the Alias wrapper is + // stripped so consumers see the usual `(inner_expr, alias_name)` tuples. + + let mut extractors = vec![LeafExpressionExtractor::new( + input_schema.as_ref(), + alias_generator, + )]; + let input_column_sets = vec![schema_columns(input_schema.as_ref())]; + + let original_schema = proj.schema.as_ref(); + let mut recovery_exprs: Vec = Vec::with_capacity(proj.expr.len()); + let mut needs_recovery = false; + let mut has_new_extractions = false; + + for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) { + if let Expr::Alias(alias) = expr + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + // Insert the full Alias expression as the key so that + // distinct alias names don't collide in the IndexMap. + let alias_name = alias.name.clone(); + + for col_ref in alias.expr.column_refs() { + extractors[0].columns_needed.insert(col_ref.clone()); + } + + extractors[0] + .extracted + .insert(expr.clone(), alias_name.clone()); + recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name))); + } else if let Expr::Column(col) = expr { + // Plain column pass-through — track it in the extractor + extractors[0].columns_needed.insert(col.clone()); + recovery_exprs.push(expr.clone()); + } else { + // Everything else: run through routing_extract + let transformed = + routing_extract(expr.clone(), &mut extractors, &input_column_sets)?; + if transformed.transformed { + has_new_extractions = true; + } + let transformed_expr = transformed.data; + + // Build recovery expression, aliasing back to original name if needed + let original_name = field.name(); + let needs_alias = if let Expr::Column(col) = &transformed_expr { + col.name.as_str() != original_name + } else { + let expr_name = transformed_expr.schema_name().to_string(); + original_name != &expr_name + }; + let recovery_expr = if 
needs_alias { + needs_recovery = true; + transformed_expr + .clone() + .alias_qualified(qualifier.cloned(), original_name) + } else { + transformed_expr.clone() + }; + + // If the expression was transformed (i.e., has extracted sub-parts), + // it differs from what the pushed projection outputs → needs recovery. + // Also, any non-column, non-__extracted expression needs recovery + // because the pushed extraction projection won't output it directly. + if transformed.transformed || !matches!(expr, Expr::Column(_)) { + needs_recovery = true; + } + + recovery_exprs.push(recovery_expr); + } + } + + // Build extraction_pairs, stripping the Alias wrapper from pre-existing + // entries (they used the full Alias as the map key to avoid dedup). + let extractor = &extractors[0]; + let extraction_pairs: Vec<(Expr, String)> = extractor + .extracted + .iter() + .map(|(e, a)| match e { + Expr::Alias(alias) => (*alias.expr.clone(), a.clone()), + _ => (e.clone(), a.clone()), + }) + .collect(); + let columns_needed = &extractor.columns_needed; + + // If no extractions found, nothing to do + if extraction_pairs.is_empty() { + return Ok(None); + } + + // ── Phase 2: Push down ────────────────────────────────────────────── + let proj_input = Arc::clone(&proj.input); + let pushed = push_extraction_pairs( + &extraction_pairs, + columns_needed, + proj, + &proj_input, + alias_generator, + )?; + + // ── Phase 3: Recovery ─────────────────────────────────────────────── + match (pushed, needs_recovery) { + (Some(pushed_plan), true) => { + // Wrap with recovery projection + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(pushed_plan), + )?); + Ok(Some(recovery)) + } + (Some(pushed_plan), false) => { + // No recovery needed (pure extraction projection) + Ok(Some(pushed_plan)) + } + (None, true) => { + // Push returned None but we still have extractions to apply. 
+ // Build the extraction projection in-place (not pushed) so the + // recovery can resolve extracted expressions. + if !has_new_extractions { + // Only pre-existing __extracted aliases and columns, no new + // extractions from routing_extract. The original projection is + // already an extraction projection that couldn't be pushed + // further. Return None. + return Ok(None); + } + let input_arc = Arc::clone(input); + let extraction = build_extraction_projection_impl( + &extraction_pairs, + columns_needed, + &input_arc, + input_schema.as_ref(), + )?; + let extraction_plan = LogicalPlan::Projection(extraction); + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(extraction_plan), + )?); + Ok(Some(recovery)) + } + (None, false) => { + // No extractions could be pushed and no recovery needed + Ok(None) + } + } +} + +/// Returns true if the plan is a Projection where ALL expressions are either +/// `Alias(EXTRACTED_EXPR_PREFIX, ...)` or `Column`, with at least one extraction. +/// Such projections can safely be pushed further without re-extraction. +fn is_pure_extraction_projection(plan: &LogicalPlan) -> bool { + let LogicalPlan::Projection(proj) = plan else { + return false; + }; + let mut has_extraction = false; + for expr in &proj.expr { + match expr { + Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX) => { + has_extraction = true; + } + Expr::Column(_) => {} + _ => return false, + } } + has_extraction +} + +/// Pushes extraction pairs down through the projection's input node, +/// dispatching to the appropriate handler based on the input node type. +fn push_extraction_pairs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + proj: &Projection, + proj_input: &Arc, + alias_generator: &Arc, +) -> Result> { + match proj_input.as_ref() { + // Merge into existing projection, then try to push the result further down. + // Only merge when all outer expressions are captured (pairs + columns). 
+ // Uncaptured expressions (e.g. `col AS __common_expr_1`) would be lost + // during the merge since build_extraction_projection_impl only knows + // about the captured pairs and columns. + LogicalPlan::Projection(_) + if pairs.len() + columns_needed.len() == proj.expr.len() => + { + let target_schema = Arc::clone(proj_input.schema()); + let merged = build_extraction_projection_impl( + pairs, + columns_needed, + proj_input, + target_schema.as_ref(), + )?; + let merged_plan = LogicalPlan::Projection(merged); + + // After merging, try to push the result further down, but ONLY + // if the merged result is still a pure extraction projection + // (all __extracted aliases + columns). If the merge inherited + // bare MoveTowardsLeafNodes expressions from the inner projection, + // pushing would re-extract them into new aliases and fail when + // the (None, true) fallback can't find the original aliases. + // This handles: Extraction → Recovery(cols) → Filter → ... → TableScan + // by pushing through the recovery projection AND the filter in one pass. + if is_pure_extraction_projection(&merged_plan) + && let Some(pushed) = try_push_input(&merged_plan, alias_generator)? + { + return Ok(Some(pushed)); + } + Ok(Some(merged_plan)) + } + // Generic: handles Filter/Sort/Limit (via recursion), + // SubqueryAlias (with qualifier remap in try_push_into_inputs), + // Join, and anything else. + // Safely bails out for nodes that don't pass through extracted + // columns (Aggregate, Window) via the output schema check. + _ => try_push_into_inputs( + pairs, + columns_needed, + proj_input.as_ref(), + alias_generator, + ), + } +} + +/// Pushes extraction expressions into a node's inputs by routing each +/// expression to the input that owns all of its column references. +/// +/// Works for any number of inputs (1, 2, …N). For single-input nodes, +/// all expressions trivially route to that input. 
For multi-input nodes +/// (Join, etc.), each expression is routed to the side that owns its columns. +/// +/// Returns `Some(new_node)` if all expressions could be routed AND the +/// rebuilt node's output schema contains all extracted aliases. +/// Returns `None` if any expression references columns from multiple inputs +/// or the node doesn't pass through the extracted columns. +/// +/// # Example: Join with expressions from both sides +/// +/// ```text +/// Extraction projection above a Join: +/// Projection: left.user['name'] AS __extracted_1, right.order['total'] AS __extracted_2, ... +/// Join: left.id = right.user_id +/// TableScan: left [id, user] +/// TableScan: right [user_id, order] +/// +/// After routing each expression to its owning input: +/// Join: left.id = right.user_id +/// Projection: user['name'] AS __extracted_1, id, user <-- left-side extraction +/// TableScan: left [id, user] +/// Projection: order['total'] AS __extracted_2, user_id, order <-- right-side extraction +/// TableScan: right [user_id, order] +/// ``` +fn try_push_into_inputs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + node: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let inputs = node.inputs(); + if inputs.is_empty() { + return Ok(None); + } + + // SubqueryAlias remaps qualifiers between input and output. + // Rewrite pairs/columns from alias-space to input-space before routing. 
+ let (pairs, columns_needed) = if let LogicalPlan::SubqueryAlias(sa) = node { + let mut replace_map = HashMap::new(); + for ((input_q, input_f), (alias_q, alias_f)) in + sa.input.schema().iter().zip(sa.schema.iter()) + { + replace_map.insert( + qualified_name(alias_q, alias_f.name()), + Expr::Column(Column::new(input_q.cloned(), input_f.name())), + ); + } + let remapped_pairs: Vec<(Expr, String)> = pairs + .iter() + .map(|(expr, alias)| { + Ok(( + replace_cols_by_name(expr.clone(), &replace_map)?, + alias.clone(), + )) + }) + .collect::>()?; + let remapped_columns: IndexSet = columns_needed + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + (remapped_pairs, remapped_columns) + } else { + (pairs.to_vec(), columns_needed.clone()) + }; + let pairs = &pairs[..]; + let columns_needed = &columns_needed; + + let num_inputs = inputs.len(); + + // Build per-input column sets using existing schema_columns() + let input_schemas: Vec> = + inputs.iter().map(|i| Arc::clone(i.schema())).collect(); + let input_column_sets: Vec> = + input_schemas.iter().map(|s| schema_columns(s)).collect(); + + // Route pairs and columns to inputs. + // Union: all inputs share the same schema, so broadcast to every branch. + // Everything else (Join, single-input nodes): columns are disjoint across + // inputs, so route each expression to its owning input. + let broadcast = matches!(node, LogicalPlan::Union(_)); + + let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; + let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; + + if broadcast { + // Union output schema and each input schema have the same fields by + // index but may differ in qualifiers (e.g. output `s` vs input + // `simple_struct.s`). Remap pairs/columns to each input's space. 
+ let union_schema = node.schema(); + for (idx, input_schema) in input_schemas.iter().enumerate() { + let mut remap = HashMap::new(); + for ((out_q, out_f), (in_q, in_f)) in + union_schema.iter().zip(input_schema.iter()) + { + remap.insert( + qualified_name(out_q, out_f.name()), + Expr::Column(Column::new(in_q.cloned(), in_f.name())), + ); + } + per_input_pairs[idx] = pairs + .iter() + .map(|(expr, alias)| { + Ok((replace_cols_by_name(expr.clone(), &remap)?, alias.clone())) + }) + .collect::>()?; + per_input_columns[idx] = columns_needed + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &remap).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + } + } else { + for (expr, alias) in pairs { + match find_owning_input(expr, &input_column_sets) { + Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), + None => return Ok(None), // Cross-input expression — bail out + } + } + for col in columns_needed { + let col_expr = Expr::Column(col.clone()); + match find_owning_input(&col_expr, &input_column_sets) { + Some(idx) => { + per_input_columns[idx].insert(col.clone()); + } + None => return Ok(None), // Ambiguous column — bail out + } + } + } + + // Check at least one input has extractions to push + if per_input_pairs.iter().all(|p| p.is_empty()) { + return Ok(None); + } + + // Build per-input extraction projections and push them as far as possible + // immediately. This is critical because map_children preserves cached schemas, + // so if the TopDown pass later pushes a child further (changing its output + // schema), the parent node's schema becomes stale. 
+ let mut new_inputs: Vec = Vec::with_capacity(num_inputs); + for (idx, input) in inputs.into_iter().enumerate() { + if per_input_pairs[idx].is_empty() { + new_inputs.push(input.clone()); + } else { + let input_arc = Arc::new(input.clone()); + let target_schema = Arc::clone(input.schema()); + let proj = build_extraction_projection_impl( + &per_input_pairs[idx], + &per_input_columns[idx], + &input_arc, + target_schema.as_ref(), + )?; + // Verify all requested aliases appear in the projection's output. + // A merge may deduplicate if the same expression already exists + // under a different alias, leaving the requested alias missing. + let proj_schema = proj.schema.as_ref(); + for (_expr, alias) in &per_input_pairs[idx] { + if !proj_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + let proj_plan = LogicalPlan::Projection(proj); + // Try to push the extraction projection further down within + // this input (e.g., through Filter → existing extraction projection). + // This ensures the input's output schema is stable and won't change + // when the TopDown pass later visits children. + match try_push_input(&proj_plan, alias_generator)? { + Some(pushed) => new_inputs.push(pushed), + None => new_inputs.push(proj_plan), + } + } + } + + // Rebuild the node with new inputs + let new_node = node.with_new_exprs(node.expressions(), new_inputs)?; + + // Safety check: verify all extracted aliases appear in the rebuilt + // node's output schema. Nodes like Aggregate define their own output + // and won't pass through extracted columns — bail out for those. 
+ let output_schema = new_node.schema(); + for (_expr, alias) in pairs { + if !output_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + + Ok(Some(new_node)) } #[cfg(test)] @@ -334,13 +1271,24 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_3 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + TableScan: test projection=[id, user] "#) } @@ -383,10 +1331,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + TableScan: test projection=[user] "#) } @@ -410,10 +1361,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS has_name + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) IS NOT NULL AS has_name + TableScan: test 
projection=[user] "#) } @@ -459,13 +1413,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_2 IS NOT NULL AND __datafusion_extracted_2 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 IS NOT NULL AND __datafusion_extracted_3 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] "#) } @@ -482,13 +1445,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] "#) } @@ -507,13 +1479,22 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: 
__datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_2]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[user] "#) } @@ -535,13 +1516,22 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: test.user, COUNT(__datafusion_extracted_2) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2)]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: test.user, COUNT(__datafusion_extracted_3) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3)]] + Projection: mock_leaf(test.user, Utf8("value")) AS 
__datafusion_extracted_3, test.user + TableScan: test projection=[user] "#) } @@ -560,13 +1550,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: mock_leaf(test.user, Utf8("name")) + Projection: test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -586,10 +1586,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS username + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS username + TableScan: test projection=[user] "#) } @@ -611,13 +1614,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, mock_leaf(test.user, Utf8("label")) + Projection: test.user + Filter: __datafusion_extracted_1 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + 
Projection: test.user, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("label")) + Filter: __datafusion_extracted_2 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: test.user, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("label")) + Filter: __datafusion_extracted_4 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -638,10 +1651,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")), mock_leaf(test.user, Utf8("name")) AS name2 + TableScan: test projection=[user] "#) } @@ -668,10 +1684,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Sort: test.user ASC NULLS FIRST + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Sort: test.user ASC NULLS FIRST + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] "#) } @@ -694,10 +1716,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 
AS mock_leaf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -720,13 +1748,19 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user + TableScan: test projection=[user] "#) } @@ -799,13 +1833,28 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 IS NOT NULL + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_5 IS NOT NULL + Projection: test.id, test.user, __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -827,10 +1876,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + TableScan: test projection=[user] "#) } @@ -900,13 +1952,28 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS 
__datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_5]], aggr=[[COUNT(Int32(1))]] + Projection: __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -926,13 +1993,28 @@ mod tests { TableScan: test projection=[a, b, c] ## After Extraction - (same as original) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_1 = Int32(2) + Projection: mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c + TableScan: test projection=[a, b, c] ## After Pushdown - (same as after extraction) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_3 = Int32(2) + Filter: __datafusion_extracted_4 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_4, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_3 + TableScan: test projection=[a, b, c] ## Optimized - (same as after pushdown) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_5 = Int32(2) + Projection: test.a, test.b, test.c, __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_6, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_5 + TableScan: test projection=[a, b, c] "#) } @@ -974,13 +2056,28 @@ mod tests { TableScan: right projection=[id, 
user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_3 = __datafusion_extracted_4 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_4, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1010,13 +2107,25 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, 
Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] "#) } @@ -1047,13 +2156,28 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") AND __datafusion_extracted_4 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_5 = Utf8("active") AND __datafusion_extracted_6 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, test.user + TableScan: test projection=[id, user] + 
Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1115,13 +2239,34 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_4 = Utf8("active") + Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_7 = Utf8("active") + Projection: test.id, test.user, __datafusion_extracted_7, right.id, right.user + Inner Join: __datafusion_extracted_8 = __datafusion_extracted_9 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_8, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS 
__datafusion_extracted_9, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1153,10 +2298,20 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id + TableScan: right projection=[id, user] "#) } @@ -1185,10 +2340,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1213,10 +2374,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS 
__datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1236,13 +2403,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: x + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(x, Utf8("a")) AS __datafusion_extracted_1, x + Projection: test.user AS x + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: x + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: x + Filter: __datafusion_extracted_3 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_3 + TableScan: test projection=[user] "#) } @@ -1269,10 +2446,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1294,13 +2477,26 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: mock_leaf(sub.user, Utf8("name")) + Projection: sub.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: 
mock_leaf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user + SubqueryAlias: sub + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -1325,10 +2521,18 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1453,13 +2657,24 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, 
user] ## After Pushdown - (same as after extraction) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_3 = Utf8("a") AND __datafusion_extracted_4 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_4, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_5 = Utf8("a") AND __datafusion_extracted_6 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_5, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_6, test.id + TableScan: test projection=[id, user] "#) } @@ -1484,13 +2699,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -1511,13 +2736,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction 
- (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_4 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -1554,13 +2789,28 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(right.user, Utf8("status")) + Projection: test.id, test.user, right.id, right.user + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, 
user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_5 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, test.id + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_5, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: right projection=[id, user] "#) } @@ -1585,13 +2835,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_5 > Int32(5) + 
Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: test projection=[id, user] "#) } } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 877a84fe4dc14..118ddef49b7e7 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -43,6 +43,7 @@ use crate::eliminate_join::EliminateJoin; use crate::eliminate_limit::EliminateLimit; use crate::eliminate_outer_join::EliminateOuterJoin; use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; +use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections}; use crate::filter_null_join_keys::FilterNullJoinKeys; use crate::optimize_projections::OptimizeProjections; use crate::optimize_unions::OptimizeUnions; @@ -260,6 +261,8 @@ impl Optimizer { // that might benefit from the following rules Arc::new(EliminateGroupByConstant::new()), Arc::new(CommonSubexprEliminate::new()), + Arc::new(ExtractLeafExpressions::new()), + Arc::new(PushDownLeafProjections::new()), Arc::new(OptimizeProjections::new()), ]; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 6f615ec391c9e..c5907d497500e 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -197,6 +197,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test 
projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -219,6 +221,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true @@ -558,6 +562,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -580,6 +586,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test 
projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index 5a4411233424a..c6885ae40b3e9 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -244,7 +244,7 @@ query TT explain select column1.c0 from t; ---- logical_plan -01)Projection: get_field(t.column1, Utf8("c0")) +01)Projection: get_field(t.column1, Utf8("c0")) AS t.column1[c0] 02)--TableScan: t projection=[column1] physical_plan 01)ProjectionExec: expr=[get_field(column1@0, c0) as t.column1[c0]] diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 50e26b2fb0b85..6dfa66cda51c9 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -104,7 +104,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet @@ -122,7 +122,7 @@ query TT EXPLAIN SELECT s['label'] FROM simple_struct; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("label")) +01)Projection: get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 02)--TableScan: simple_struct projection=[s] physical_plan DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -144,7 +144,7 @@ query TT EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -166,7 +166,7 @@ query TT EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; ---- logical_plan -01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] 02)--TableScan: nested_struct projection=[id, nested] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet @@ -186,7 +186,7 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet @@ -208,7 +208,7 @@ query TT EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet @@ -235,13 +235,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] 
+02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -259,13 +260,14 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +01)Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query 
II @@ -283,13 +285,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) -02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[label] +02)--Filter: __datafusion_extracted_1 > Int64(150) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] -02)--FilterExec: get_field(s@1, value) > 150 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -313,7 +316,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 
03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -338,7 +341,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -363,7 +366,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -437,7 +440,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -460,7 +463,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 
03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -483,7 +486,7 @@ EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3 ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -506,7 +509,7 @@ EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] 03)----TableScan: nested_struct projection=[id, nested] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -528,7 +531,7 @@ EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -556,14 +559,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- 
logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -583,14 +587,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) 
-04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -608,14 +613,15 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LI ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: 
simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -673,7 +679,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] @@ -699,7 +705,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] 03)----TableScan: multi_struct projection=[id, s] 
physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -723,7 +729,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) AS multi_struct.s[value] + Int64(1) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -747,16 +753,17 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, __datafusion_extracted_1 AS multi_struct.s[value] 03)----Filter: multi_struct.id > Int64(2) -04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +04)------Projection: get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_1, multi_struct.id +05)--------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] 02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as multi_struct.s[value]] -04)------FilterExec: id@0 > 2 +03)----ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as multi_struct.s[value]] +04)------FilterExec: id@1 > 2 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -774,13 +781,16 @@ query TT EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; ---- logical_plan -01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] -02)--TableScan: multi_struct projection=[s] +01)Projection: __datafusion_extracted_1 AS multi_struct.s[label], sum(__datafusion_extracted_2) AS sum(multi_struct.s[value]) +02)--Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[sum(__datafusion_extracted_2)]] +03)----Projection: get_field(multi_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: multi_struct projection=[s] physical_plan -01)AggregateExec: 
mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 -03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as multi_struct.s[label], sum(__datafusion_extracted_2)@1 as sum(multi_struct.s[value])] +02)--AggregateExec: mode=FinalPartitioned, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +03)----RepartitionExec: partitioning=Hash([__datafusion_extracted_1@0], 4), input_partitions=3 +04)------AggregateExec: mode=Partial, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, 
projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness query TI @@ -809,7 +819,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM nullable_struct; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) AS nullable_struct.s[value] 02)--TableScan: nullable_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet @@ -831,13 +841,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) -02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL -03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +01)Projection: nullable_struct.id, __datafusion_extracted_2 AS nullable_struct.s[label] +02)--Filter: __datafusion_extracted_1 IS NOT NULL +03)----Projection: get_field(nullable_struct.s, Utf8("value")) AS __datafusion_extracted_1, nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] -02)--FilterExec: get_field(s@1, value) IS NOT NULL -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, 
__datafusion_extracted_2@1 as nullable_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 IS NOT NULL, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -856,7 +867,7 @@ EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("value")) + Int64(10), get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("value")) + Int64(10) AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -954,27 +965,29 @@ EXPLAIN SELECT (id + s['value']) * (id + s['value']) as id_and_value FROM simple ---- logical_plan 01)Projection: __common_expr_1 * __common_expr_1 AS id_and_value -02)--Projection: simple_struct.id + get_field(simple_struct.s, Utf8("value")) AS __common_expr_1 +02)--Projection: simple_struct.id + __datafusion_extracted_2 AS __common_expr_1 03)----Filter: simple_struct.id > Int64(2) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan 01)ProjectionExec: 
expr=[__common_expr_1@0 * __common_expr_1@0 as id_and_value] -02)--ProjectionExec: expr=[id@0 + get_field(s@1, value) as __common_expr_1] -03)----FilterExec: id@0 > 2 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 + __datafusion_extracted_2@0 as __common_expr_1] +03)----FilterExec: id@1 > 2 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] query TT EXPLAIN SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) + get_field(simple_struct.s, Utf8("value")) AS doubled +01)Projection: __datafusion_extracted_1 + __datafusion_extracted_1 AS doubled 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) + get_field(s@0, value) as doubled] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, 
required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 + __datafusion_extracted_1@0 as doubled] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -992,13 +1005,14 @@ query TT EXPLAIN SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value], get_field(s@0, label) as simple_struct.s[label]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: id@2 > 2, projection=[__datafusion_extracted_1@0, 
__datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query IT @@ -1041,13 +1055,14 @@ query TT EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) * Int64(2) + CAST(character_length(get_field(simple_struct.s, Utf8("label"))) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score +01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) * 2 + CAST(character_length(get_field(s@0, label)) AS Int64) as score] -02)--FilterExec: id@0 > 1, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 * 2 + CAST(character_length(__datafusion_extracted_2@1) AS Int64) as score] 
+02)--FilterExec: id@2 > 1, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query I @@ -1073,7 +1088,7 @@ EXPLAIN SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1096,7 +1111,7 @@ EXPLAIN SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100) AS simple_struct.s[value] + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") AS simple_struct.s[label] || Utf8("_test") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1117,13 +1132,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, 
__datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -1136,13 +1152,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5); ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND (simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], 
partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND (id@0 < 4 OR id@0 = 5), projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND (id@1 < 4 OR id@1 = 5), projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] # Verify correctness - should return rows where (id > 1) AND ((id < 4) OR (id = 5)) # That's: id=2,3 (1 1 AND id < 5; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND simple_struct.id < Int64(5) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct 
projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND id@0 < 5, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND id@1 < 5, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] # Verify correctness - should return rows where 1 < id < 5 (id=2,3,4) query I @@ -1177,13 +1195,14 @@ query TT EXPLAIN SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")), simple_struct.id +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label], simple_struct.id 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, 
s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label], id@0 as id] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label], id@2 as id] +02)--FilterExec: id@2 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness - note that id is now at index 2 in the augmented projection query ITI @@ -1197,13 +1216,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE length(s['label']) > 4; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) -02)--Filter: character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4) -03)----TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] +01)Projection: __datafusion_extracted_2 AS simple_struct.s[value] +02)--Filter: character_length(__datafusion_extracted_1) > Int32(4) +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, 
Utf8("label"))) > Int32(4)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: character_length(get_field(s@0, label)) > 4 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_2@0 as simple_struct.s[value]] +02)--FilterExec: character_length(__datafusion_extracted_1@0) > 4, projection=[__datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness - filter on rows where label length > 4 (all have length 5, except 'one' has 3) # Wait, from the data: alpha(5), beta(4), gamma(5), delta(5), epsilon(7) @@ -1230,12 +1250,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1258,13 +1279,13 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label']; ---- logical_plan 01)Projection: simple_struct.id, simple_struct.s[value] -02)--Sort: simple_struct.id ASC NULLS LAST, get_field(simple_struct.s, Utf8("label")) ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), simple_struct.s +02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]] -02)--SortExec: expr=[id@0 ASC NULLS LAST, get_field(s@2, label) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], s], file_type=parquet +02)--SortExec: expr=[id@0 ASC NULLS LAST, __datafusion_extracted_1@2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as __datafusion_extracted_1], file_type=parquet # Verify correctness query II @@ -1287,12 +1308,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST, fetch=2 
-03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST, fetch=2 +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: TopK(fetch=2), expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: TopK(fetch=2), expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1312,12 +1334,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] * 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) * Int64(2) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 * Int64(2) ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) * 2 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 * 2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1339,7 +1362,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value']; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST, simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -1390,13 +1413,15 @@ INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10; ---- logical_plan 01)Projection: simple_struct.id, join_right.id -02)--Inner Join: get_field(simple_struct.s, Utf8("value")) = get_field(join_right.s, Utf8("level")) * Int64(10) -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +02)--Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 * Int64(10) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(simple_struct.s[value]@2, join_right.s[level] * Int64(10)@2)], projection=[id@0, id@3] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet -03)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s, get_field(s@1, level) * 10 as join_right.s[level] * Int64(10)], file_type=parquet +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet # Verify correctness - value = level * 10 # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) @@ -1424,13 +1449,14 @@ WHERE simple_struct.s['value'] > 150; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] -05)--TableScan: join_right projection=[id] +03)----Filter: __datafusion_extracted_1 > Int64(150) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +06)--TableScan: join_right projection=[id] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: get_field(s@1, value) > 150, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet 04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches and value > 150 @@ -1459,17 +1485,19 @@ WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(100) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] -05)--Projection: join_right.id -06)----Filter: get_field(join_right.s, Utf8("level")) > Int64(3) -07)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] +03)----Filter: __datafusion_extracted_1 > Int64(100) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] +06)--Projection: join_right.id +07)----Filter: __datafusion_extracted_2 > Int64(3) +08)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +09)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] 
-02)--FilterExec: get_field(s@1, value) > 100, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)--FilterExec: get_field(s@1, level) > 3, projection=[id@0] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet # Verify correctness - id matches, value > 100, and level > 3 # Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250) @@ -1495,15 +1523,17 @@ FROM simple_struct INNER JOIN join_right ON simple_struct.id = join_right.id; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")), get_field(join_right.s, Utf8("role")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[label], __datafusion_extracted_2 AS join_right.s[role] 02)--Inner Join: simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, 
Utf8("role")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label], get_field(s@2, role) as join_right.s[role]] -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@0, s@1, s@3] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[__datafusion_extracted_1@0, id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness query ITT @@ -1561,17 +1591,20 @@ FROM simple_struct LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(join_right.s, Utf8("level")) +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[value], __datafusion_extracted_3 AS join_right.s[level] 02)--Left Join: 
simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----Filter: get_field(join_right.s, Utf8("level")) > Int64(5) -05)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: join_right.id, __datafusion_extracted_3 +06)------Filter: __datafusion_extracted_1 > Int64(5) +07)--------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_1, join_right.id, get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_3 +08)----------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] physical_plan -01)ProjectionExec: expr=[id@1 as id, get_field(s@2, value) as simple_struct.s[value], get_field(s@0, level) as join_right.s[level]] -02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(id@0, id@0)], projection=[s@1, id@2, s@3] -03)----FilterExec: get_field(s@1, level) > 5 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_2@0 as simple_struct.s[value], __datafusion_extracted_3@2 as join_right.s[level]] +02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[__datafusion_extracted_2@0, id@1, __datafusion_extracted_3@3] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_2, id], file_type=parquet +04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet # Verify correctness - left join with level > 5 condition # Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8) @@ -1599,14 +1632,15 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] 03)----RepartitionExec: partitioning=RoundRobinBatch(32), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] ##################### # Section 14: SubqueryAlias tests @@ -1621,15 +1655,16 @@ query TT EXPLAIN SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t -03)----Projection: simple_struct.s +03)----Projection: __datafusion_extracted_1 04)------Filter: simple_struct.id > Int64(2) -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +05)--------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1645,9 +1680,10 @@ EXPLAIN SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t O ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST 
-02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t -04)------TableScan: simple_struct projection=[s] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +05)--------TableScan: simple_struct projection=[s] physical_plan 01)SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as t.s[value], get_field(s@1, label) as t.s[label]], file_type=parquet @@ -1667,16 +1703,17 @@ query TT EXPLAIN SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2; ---- logical_plan -01)Projection: get_field(u.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS u.s[value] 02)--SubqueryAlias: u 03)----SubqueryAlias: t -04)------Projection: simple_struct.s +04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id > Int64(2) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as u.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: 
expr=[__datafusion_extracted_1@0 as u.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1693,11 +1730,12 @@ EXPLAIN SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 20 logical_plan 01)SubqueryAlias: t 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(200) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] +03)----Filter: __datafusion_extracted_1 > Int64(200) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] physical_plan -01)FilterExec: get_field(s@1, value) > 200, projection=[id@0] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)FilterExec: __datafusion_extracted_1@0 > 200, projection=[id@1] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet # Verify correctness query I @@ -1719,22 +1757,24 @@ EXPLAIN SELECT s['value'] FROM ( ) t; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t 03)----Union -04)------Projection: simple_struct.s 
+04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id <= Int64(3) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -07)------Projection: simple_struct.s -08)--------Filter: simple_struct.id > Int64(3) -09)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +08)------Projection: __datafusion_extracted_1 +09)--------Filter: simple_struct.id > Int64(3) +10)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +11)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] 02)--UnionExec -03)----FilterExec: id@0 <= 3, projection=[s@1] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -05)----FilterExec: id@0 > 3, projection=[s@1] -06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +03)----FilterExec: id@1 <= 3, projection=[__datafusion_extracted_1@0] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, 
projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +05)----FilterExec: id@1 > 3, projection=[__datafusion_extracted_1@0] +06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query I @@ -1760,24 +1800,26 @@ EXPLAIN SELECT s['value'], s['label'] FROM ( ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t 04)------Union -05)--------Projection: simple_struct.s +05)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 06)----------Filter: simple_struct.id <= Int64(3) -07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -08)--------Projection: simple_struct.s -09)----------Filter: simple_struct.id > Int64(3) -10)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +07)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +08)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +09)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 +10)----------Filter: simple_struct.id > Int64(3) +11)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, 
Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +12)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan 01)SortPreservingMergeExec: [t.s[value]@0 ASC NULLS LAST] 02)--SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[get_field(s@0, value) as t.s[value], get_field(s@0, label) as t.s[label]] +03)----ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value], __datafusion_extracted_2@1 as t.s[label]] 04)------UnionExec -05)--------FilterExec: id@0 <= 3, projection=[s@1] -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -07)--------FilterExec: id@0 > 3, projection=[s@1] -08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +05)--------FilterExec: id@2 <= 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +07)--------FilterExec: id@2 > 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +08)----------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query IT diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index b1cb354e053e4..edafcfaa543f2 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -116,11 +116,12 @@ explain select * from (select column1, unnest(column2) as o from d) where o['a'] ---- physical_plan 01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o] -02)--FilterExec: get_field(__unnest_placeholder(d.column2,depth=1)@1, a) = 1 +02)--FilterExec: __datafusion_extracted_1@0 = 1, projection=[column1@1, __unnest_placeholder(d.column2,depth=1)@2] 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------UnnestExec -05)--------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +04)------ProjectionExec: expr=[get_field(__unnest_placeholder(d.column2,depth=1)@1, a) as __datafusion_extracted_1, column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as __unnest_placeholder(d.column2,depth=1)] +05)--------UnnestExec +06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] +07)------------DataSourceExec: partitions=1, partition_sizes=[1] statement ok drop table d; diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index e20815a58c765..09dd98a50b579 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ 
b/datafusion/sqllogictest/test_files/struct.slt @@ -661,7 +661,7 @@ query TT explain select s['a']['b'] from explain_test; ---- logical_plan -01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) +01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) AS explain_test.s[a][b] 02)--TableScan: explain_test projection=[s] physical_plan 01)ProjectionExec: expr=[get_field(s@0, a, b) as explain_test.s[a][b]] @@ -1666,4 +1666,4 @@ order by id; 3 2 150 statement ok -drop table t_agg_window; \ No newline at end of file +drop table t_agg_window; diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 1a6b82020c667..73aeb6c99d0db 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -666,7 +666,7 @@ explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unn logical_plan 01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 02)--Unnest: lists[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1])|depth=2] structs[] -03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 +03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[] 05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3 06)----------TableScan: recursive_unnest_table projection=[column3] From 
209fe4211fc82f01ea2812c20145fa84868eb268 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 23:35:21 -0500 Subject: [PATCH 2/9] avoid unstable test snaps --- .../optimizer/src/extract_leaf_expressions.rs | 302 +++++++----------- 1 file changed, 117 insertions(+), 185 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index eede292c1a367..53b2c29abe13d 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1188,9 +1188,8 @@ mod tests { /// 3. **After Pushdown** - + PushDownLeafProjections /// 4. **Optimized** - + final OptimizeProjections fn format_optimization_stages(plan: &LogicalPlan) -> Result { - let ctx = OptimizerContext::new().with_max_passes(1); - let run = |rules: Vec>| -> Result { + let ctx = OptimizerContext::new().with_max_passes(1); let optimizer = Optimizer::with_rules(rules); let optimized = optimizer.optimize(plan.clone(), &ctx, |_, _| {})?; Ok(format!("{optimized}")) @@ -1278,16 +1277,12 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized Projection: test.id - Filter: __datafusion_extracted_3 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] "#) } @@ -1419,16 +1414,10 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, test.user - Filter: 
__datafusion_extracted_2 IS NOT NULL AND __datafusion_extracted_2 IS NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user - Filter: __datafusion_extracted_3 IS NOT NULL AND __datafusion_extracted_3 IS NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1451,16 +1440,10 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, test.user - Filter: __datafusion_extracted_2 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user - Filter: __datafusion_extracted_3 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1485,15 +1468,12 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_2]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + 
Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1522,16 +1502,10 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: test.user, COUNT(__datafusion_extracted_2) AS COUNT(mock_leaf(test.user,Utf8("value"))) - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: test.user, COUNT(__datafusion_extracted_3) AS COUNT(mock_leaf(test.user,Utf8("value"))) - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1557,15 +1531,15 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -1621,16 +1595,13 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: test.user, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("label")) - Filter: __datafusion_extracted_2 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_3 + Projection: test.user, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("label")) + Filter: __datafusion_extracted_1 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized - Projection: test.user, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("label")) - Filter: __datafusion_extracted_4 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_5 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1690,10 +1661,7 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Sort: test.user ASC NULLS FIRST - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1722,9 +1690,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS 
__datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1753,14 +1721,10 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1843,17 +1807,17 @@ mod tests { ## After Pushdown Projection: test.id, test.user - Filter: __datafusion_extracted_3 IS NOT NULL - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 IS NOT NULL + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] ## Optimized Projection: test.id, test.user - Filter: __datafusion_extracted_5 IS NOT NULL - Projection: test.id, test.user, __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: test.id, test.user, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test 
projection=[id, user] "#) } @@ -1961,18 +1925,18 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_5]], aggr=[[COUNT(Int32(1))]] - Projection: __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2003,17 +1967,17 @@ mod tests { ## After Pushdown Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_3 = Int32(2) - Filter: __datafusion_extracted_4 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS 
__datafusion_extracted_4, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 = Int32(2) + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] ## Optimized Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_5 = Int32(2) - Projection: test.a, test.b, test.c, __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_6, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_5 + Filter: __datafusion_extracted_1 = Int32(2) + Projection: test.a, test.b, test.c, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] "#) } @@ -2064,20 +2028,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_3 = __datafusion_extracted_4 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_4, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user - TableScan: right 
projection=[id, user] + (same as after pushdown) "#) } @@ -2114,18 +2068,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2164,20 +2110,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") AND __datafusion_extracted_4 = Utf8("admin") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_5 = Utf8("active") AND __datafusion_extracted_6 = Utf8("admin") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_6, right.id, right.user - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2251,21 
+2187,21 @@ mod tests { ## After Pushdown Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_4 = Utf8("active") - Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + Filter: __datafusion_extracted_1 = Utf8("active") + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] ## Optimized Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_7 = Utf8("active") - Projection: test.id, test.user, __datafusion_extracted_7, right.id, right.user - Inner Join: __datafusion_extracted_8 = __datafusion_extracted_9 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_8, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_9, right.id, right.user + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] "#) } @@ 
-2306,11 +2242,11 @@ mod tests { TableScan: right projection=[id, user] ## Optimized - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("role")) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id TableScan: right projection=[id, user] "#) } @@ -2346,9 +2282,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(x,Utf8("a")) + Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2380,9 +2316,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2411,14 +2347,14 @@ mod tests { ## After Pushdown Projection: x - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2, 
test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized Projection: x - Filter: __datafusion_extracted_3 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2452,9 +2388,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2485,17 +2421,17 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.user + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -2528,10 +2464,10 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(outer_sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) SubqueryAlias: outer_sub SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2664,16 +2600,12 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_3 = Utf8("a") AND __datafusion_extracted_4 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_4, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized Projection: test.id - Filter: __datafusion_extracted_5 = Utf8("a") AND __datafusion_extracted_6 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_5, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_6, test.id + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] "#) } @@ -2706,15 +2638,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS 
mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } @@ -2743,15 +2675,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, 
__datafusion_extracted_5 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_4 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } @@ -2797,19 +2729,19 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_4 + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: right projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_5 > 
Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, test.id + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_5, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_7 + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: right projection=[id, user] "#) } @@ -2842,15 +2774,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_5 > Int32(5) - 
Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, user] "#) } From fada05c0650918e2de9c9b1bc9114a4055e905d2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 23:52:16 -0500 Subject: [PATCH 3/9] use common function --- .../optimizer/src/extract_leaf_expressions.rs | 131 +++--------------- datafusion/optimizer/src/test/udfs.rs | 42 ++++-- 2 files changed, 56 insertions(+), 117 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 53b2c29abe13d..18f7e53b1afe8 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1104,72 +1104,25 @@ mod tests { use super::*; use crate::optimize_projections::OptimizeProjections; + use crate::test::udfs::PlacementTestUDF; use crate::test::*; use crate::{Optimizer, OptimizerContext}; - use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::expr::ScalarFunction; + use datafusion_expr::{Expr, ExpressionPlacement}; use datafusion_expr::{ - ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, - TypeSignature, col, lit, logical_plan::builder::LogicalPlanBuilder, + ScalarUDF, col, lit, logical_plan::builder::LogicalPlanBuilder, }; - use datafusion_expr::{Expr, 
ExpressionPlacement}; - - /// A mock UDF that simulates a leaf-pushable function like `get_field`. - /// It returns `MoveTowardsLeafNodes` when its first argument is Column or MoveTowardsLeafNodes. - #[derive(Debug, PartialEq, Eq, Hash)] - struct MockLeafFunc { - signature: Signature, - } - - impl MockLeafFunc { - fn new() -> Self { - Self { - signature: Signature::new( - TypeSignature::Any(2), - datafusion_expr::Volatility::Immutable, - ), - } - } - } - - impl ScalarUDFImpl for MockLeafFunc { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn name(&self) -> &str { - "mock_leaf" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn return_type(&self, _args: &[DataType]) -> Result { - Ok(DataType::Utf8) - } - - fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { - unimplemented!("This is only used for testing optimization") - } - - fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement { - // Return MoveTowardsLeafNodes if first arg is Column or MoveTowardsLeafNodes - // (like get_field does) - match args.first() { - Some(ExpressionPlacement::Column) - | Some(ExpressionPlacement::MoveTowardsLeafNodes) => { - ExpressionPlacement::MoveTowardsLeafNodes - } - _ => ExpressionPlacement::KeepInPlace, - } - } - } fn mock_leaf(expr: Expr, name: &str) -> Expr { Expr::ScalarFunction(ScalarFunction::new_udf( - Arc::new(ScalarUDF::new_from_impl(MockLeafFunc::new())), + Arc::new(ScalarUDF::new_from_impl( + PlacementTestUDF::new() + .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) + // Use mock_leaf to minimize snapshot churn vs. previous implementation that used a UDF with this name. + // We can remove this name change and accept the snapshot diff in the future. + .with_name("mock_leaf"), + )), vec![expr, lit(name)], )) } @@ -2497,63 +2450,23 @@ mod tests { ") } - /// A variant of MockLeafFunc with the same `name()` but a different concrete type. 
- /// Used to verify that deduplication uses `Expr` equality, not `schema_name`. - #[derive(Debug, PartialEq, Eq, Hash)] - struct MockLeafFuncVariant { - signature: Signature, - } - - impl MockLeafFuncVariant { - fn new() -> Self { - Self { - signature: Signature::new( - TypeSignature::Any(2), - datafusion_expr::Volatility::Immutable, - ), - } - } - } - - impl ScalarUDFImpl for MockLeafFuncVariant { - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn name(&self) -> &str { - "mock_leaf" - } - - fn signature(&self) -> &Signature { - &self.signature - } - - fn return_type(&self, _args: &[DataType]) -> Result { - Ok(DataType::Utf8) - } - - fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result { - unimplemented!("This is only used for testing optimization") - } - - fn placement(&self, args: &[ExpressionPlacement]) -> ExpressionPlacement { - match args.first() { - Some(ExpressionPlacement::Column) - | Some(ExpressionPlacement::MoveTowardsLeafNodes) => { - ExpressionPlacement::MoveTowardsLeafNodes - } - _ => ExpressionPlacement::KeepInPlace, - } - } - } - /// Two UDFs with the same `name()` but different concrete types should NOT be /// deduplicated -- they are semantically different expressions that happen to /// collide on `schema_name()`. 
#[test] fn test_different_udfs_same_schema_name_not_deduplicated() -> Result<()> { - let udf_a = Arc::new(ScalarUDF::new_from_impl(MockLeafFunc::new())); - let udf_b = Arc::new(ScalarUDF::new_from_impl(MockLeafFuncVariant::new())); + let udf_a = Arc::new(ScalarUDF::new_from_impl( + PlacementTestUDF::new() + .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) + .with_name("mock_leaf") + .with_id(1), + )); + let udf_b = Arc::new(ScalarUDF::new_from_impl( + PlacementTestUDF::new() + .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) + .with_name("mock_leaf") + .with_id(2), + )); let expr_a = Expr::ScalarFunction(ScalarFunction::new_udf( udf_a, diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs index 0e68568decf85..f68b1fb33da25 100644 --- a/datafusion/optimizer/src/test/udfs.rs +++ b/datafusion/optimizer/src/test/udfs.rs @@ -21,7 +21,7 @@ use arrow::datatypes::DataType; use datafusion_common::Result; use datafusion_expr::{ ColumnarValue, Expr, ExpressionPlacement, ScalarFunctionArgs, ScalarUDF, - ScalarUDFImpl, Signature, Volatility, + ScalarUDFImpl, Signature, TypeSignature, }; /// A configurable test UDF for optimizer tests. @@ -30,6 +30,8 @@ use datafusion_expr::{ pub struct PlacementTestUDF { signature: Signature, placement: ExpressionPlacement, + name: String, + id: usize, } impl Default for PlacementTestUDF { @@ -41,13 +43,42 @@ impl Default for PlacementTestUDF { impl PlacementTestUDF { pub fn new() -> Self { Self { - signature: Signature::exact(vec![DataType::UInt32], Volatility::Immutable), + name: "leaf_udf".to_string(), + // Accept any two arguments and return UInt32 for testing purposes. + // The actual types don't matter since this UDF is not intended for execution. 
+ signature: Signature::new( + TypeSignature::Any(2), + datafusion_expr::Volatility::Immutable, + ), placement: ExpressionPlacement::MoveTowardsLeafNodes, + id: 0, } } + /// Set the expression placement for this UDF, which is used by optimizer rules to determine where in the plan the expression should be placed. + /// This also resets the name of the UDF to a default based on the placement. pub fn with_placement(mut self, placement: ExpressionPlacement) -> Self { self.placement = placement; + self.name = match self.placement { + ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", + ExpressionPlacement::KeepInPlace => "keep_in_place_udf", + ExpressionPlacement::Column => "column_udf", + ExpressionPlacement::Literal => "literal_udf", + } + .to_string(); + self + } + + /// Set the name of the UDF, which is used in the expression and thus in optimizer rules. + pub fn with_name(mut self, name: &str) -> Self { + self.name = name.to_string(); + self + } + + /// Set the id of the UDF. + /// This is an arbitrary made up field to allow creating multiple distinct UDFs with the same placement. 
+ pub fn with_id(mut self, id: usize) -> Self { + self.id = id; self } } @@ -57,12 +88,7 @@ impl ScalarUDFImpl for PlacementTestUDF { self } fn name(&self) -> &str { - match self.placement { - ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", - ExpressionPlacement::KeepInPlace => "keep_in_place_udf", - ExpressionPlacement::Column => "column_udf", - ExpressionPlacement::Literal => "literal_udf", - } + &self.name } fn signature(&self) -> &Signature { &self.signature From 6b99ef249d93bbd11186c19ec78feeccd92ad02b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 07:53:23 -0500 Subject: [PATCH 4/9] fix --- datafusion/optimizer/src/test/udfs.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs index f68b1fb33da25..35ea3e44d3e72 100644 --- a/datafusion/optimizer/src/test/udfs.rs +++ b/datafusion/optimizer/src/test/udfs.rs @@ -44,10 +44,10 @@ impl PlacementTestUDF { pub fn new() -> Self { Self { name: "leaf_udf".to_string(), - // Accept any two arguments and return UInt32 for testing purposes. + // Accept any one or two arguments and return UInt32 for testing purposes. // The actual types don't matter since this UDF is not intended for execution. 
signature: Signature::new( - TypeSignature::Any(2), + TypeSignature::OneOf(vec![TypeSignature::Any(1), TypeSignature::Any(2)]), datafusion_expr::Volatility::Immutable, ), placement: ExpressionPlacement::MoveTowardsLeafNodes, From e195438ee3e7f6809a33b0202266fea76e183b16 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:44:03 -0500 Subject: [PATCH 5/9] Merge expression pushdown test udfs, fix re-used optimizer context --- .../optimizer/src/extract_leaf_expressions.rs | 1359 +---------------- 1 file changed, 83 insertions(+), 1276 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 18f7e53b1afe8..c55e9b9168123 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -15,28 +15,23 @@ // specific language governing permissions and limitations // under the License. +//! NB: This module is a work in progress. +//! We merged it early in +//! with the skeleton and snapshots matching the current state, +//! but the actual implementation is pending further development. +//! There may be comments or code that are incomplete or inaccurate. //! Two-pass optimizer pipeline that pushes cheap expressions (like struct field //! access `user['status']`) closer to data sources, enabling early data reduction //! and source-level optimizations (e.g., Parquet column pruning). See //! [`ExtractLeafExpressions`] (pass 1) and [`PushDownLeafProjections`] (pass 2). 
-use indexmap::{IndexMap, IndexSet}; -use std::collections::HashMap; -use std::sync::Arc; - -use datafusion_common::alias::AliasGenerator; -use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; -use datafusion_common::{Column, DFSchema, Result, qualified_name}; +use datafusion_common::Result; +use datafusion_common::tree_node::Transformed; use datafusion_expr::logical_plan::LogicalPlan; -use datafusion_expr::{Expr, ExpressionPlacement, Projection}; use crate::optimizer::ApplyOrder; -use crate::push_down_filter::replace_cols_by_name; -use crate::utils::has_all_column_refs; use crate::{OptimizerConfig, OptimizerRule}; -const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; - /// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes /// into **extraction projections** (pass 1 of 2). /// @@ -77,8 +72,7 @@ const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; /// ``` /// /// **Important:** The `PushDownFilter` rule is aware of projections created by this rule -/// and will not push filters through them. It uses `ExpressionPlacement` to detect -/// `MoveTowardsLeafNodes` expressions and skip filter pushdown past them. +/// and will not push filters through them. See `is_extracted_expr_projection` in utils.rs. #[derive(Default, Debug)] pub struct ExtractLeafExpressions {} @@ -101,443 +95,9 @@ impl OptimizerRule for ExtractLeafExpressions { fn rewrite( &self, plan: LogicalPlan, - config: &dyn OptimizerConfig, + _config: &dyn OptimizerConfig, ) -> Result> { - let alias_generator = config.alias_generator(); - extract_from_plan(plan, alias_generator) - } -} - -/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. -/// -/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes -/// like Join, each extracted sub-expression is routed to the correct input -/// by checking which input's schema contains all of the expression's column -/// references. 
-fn extract_from_plan( - plan: LogicalPlan, - alias_generator: &Arc, -) -> Result> { - // Only extract from plan types whose output schema is predictable after - // expression rewriting. Nodes like Window derive column names from - // their expressions, so rewriting `get_field` inside a window function - // changes the output schema and breaks the recovery projection. - if !matches!( - &plan, - LogicalPlan::Aggregate(_) - | LogicalPlan::Filter(_) - | LogicalPlan::Sort(_) - | LogicalPlan::Limit(_) - | LogicalPlan::Join(_) - ) { - return Ok(Transformed::no(plan)); - } - - let inputs = plan.inputs(); - if inputs.is_empty() { - return Ok(Transformed::no(plan)); - } - - // Save original output schema before any transformation - let original_schema = Arc::clone(plan.schema()); - - // Clone inputs upfront (before plan is consumed by map_expressions) - let owned_inputs: Vec = inputs.into_iter().cloned().collect(); - - // Build per-input schemas (kept alive for extractor borrows) - let input_schemas: Vec> = owned_inputs - .iter() - .map(|i| Arc::clone(i.schema())) - .collect(); - - // Build per-input extractors - let mut extractors: Vec = input_schemas - .iter() - .map(|schema| LeafExpressionExtractor::new(schema.as_ref(), alias_generator)) - .collect(); - - // Build per-input column sets for routing expressions to the correct input - let input_column_sets: Vec> = input_schemas - .iter() - .map(|schema| schema_columns(schema.as_ref())) - .collect(); - - // Transform expressions via map_expressions with routing - let transformed = plan.map_expressions(|expr| { - routing_extract(expr, &mut extractors, &input_column_sets) - })?; - - // If no expressions were rewritten, nothing was extracted - if !transformed.transformed { - return Ok(transformed); - } - - // Build per-input extraction projections (None means no extractions for that input) - let new_inputs: Vec = owned_inputs - .iter() - .zip(extractors.iter()) - .map(|(input, extractor)| { - let input_arc = 
Arc::new(input.clone()); - Ok(extractor - .build_extraction_projection(&input_arc)? - .unwrap_or_else(|| input.clone())) - }) - .collect::>>()?; - - // Rebuild and add recovery projection if schema changed - let new_plan = transformed - .data - .with_new_exprs(transformed.data.expressions(), new_inputs)?; - - // Add recovery projection if the output schema changed - let recovered = build_recovery_projection(original_schema.as_ref(), new_plan)?; - - Ok(Transformed::yes(recovered)) -} - -/// Given an expression, returns the index of the input whose columns fully -/// cover the expression's column references. -/// Returns `None` if the expression references columns from multiple inputs. -fn find_owning_input( - expr: &Expr, - input_column_sets: &[std::collections::HashSet], -) -> Option { - input_column_sets - .iter() - .position(|cols| has_all_column_refs(expr, cols)) -} - -/// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes` -/// sub-expressions and routing each to the correct per-input extractor. 
-fn routing_extract( - expr: Expr, - extractors: &mut [LeafExpressionExtractor], - input_column_sets: &[std::collections::HashSet], -) -> Result> { - expr.transform_down(|e| { - // Skip expressions already aliased with extracted expression pattern - if let Expr::Alias(alias) = &e - && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) - { - return Ok(Transformed { - data: e, - transformed: false, - tnr: TreeNodeRecursion::Jump, - }); - } - - // Don't extract Alias nodes directly — preserve the alias and let - // transform_down recurse into the inner expression - if matches!(&e, Expr::Alias(_)) { - return Ok(Transformed::no(e)); - } - - match e.placement() { - ExpressionPlacement::MoveTowardsLeafNodes => { - if let Some(idx) = find_owning_input(&e, input_column_sets) { - let col_ref = extractors[idx].add_extracted(e)?; - Ok(Transformed::yes(col_ref)) - } else { - // References columns from multiple inputs — cannot extract - Ok(Transformed::no(e)) - } - } - ExpressionPlacement::Column => { - // Track columns that the parent node references so the - // extraction projection includes them as pass-through. - // Without this, the extraction projection would only - // contain __extracted_N aliases, and the parent couldn't - // resolve its other column references. 
- if let Expr::Column(col) = &e - && let Some(idx) = find_owning_input(&e, input_column_sets) - { - extractors[idx].columns_needed.insert(col.clone()); - } - Ok(Transformed::no(e)) - } - _ => Ok(Transformed::no(e)), - } - }) -} - -/// Returns all columns in the schema (both qualified and unqualified forms) -fn schema_columns(schema: &DFSchema) -> std::collections::HashSet { - schema - .iter() - .flat_map(|(qualifier, field)| { - [ - Column::new(qualifier.cloned(), field.name()), - Column::new_unqualified(field.name()), - ] - }) - .collect() -} - -// ============================================================================= -// Helper Functions for Extraction Targeting -// ============================================================================= - -/// Build a replacement map from a projection: output_column_name -> underlying_expr. -/// -/// This is used to resolve column references through a renaming projection. -/// For example, if a projection has `user AS x`, this maps `x` -> `col("user")`. -fn build_projection_replace_map(projection: &Projection) -> HashMap { - projection - .schema - .iter() - .zip(projection.expr.iter()) - .map(|((qualifier, field), expr)| { - let key = Column::from((qualifier, field)).flat_name(); - (key, expr.clone().unalias()) - }) - .collect() -} - -/// Build a recovery projection to restore the original output schema. -/// -/// After extraction, a node's output schema may differ from the original: -/// -/// - **Schema-preserving nodes** (Filter/Sort/Limit): the extraction projection -/// below adds extra `__extracted_N` columns that bubble up through the node. -/// Recovery selects only the original columns to hide the extras. -/// ```text -/// Original schema: [id, user] -/// After extraction: [__extracted_1, id, user] ← extra column leaked through -/// Recovery: SELECT id, user FROM ... 
← hides __extracted_1 -/// ``` -/// -/// - **Schema-defining nodes** (Aggregate): same number of columns but names -/// may differ because extracted aliases replaced the original expressions. -/// Recovery maps positionally, aliasing where names changed. -/// ```text -/// Original: [SUM(user['balance'])] -/// After: [SUM(__extracted_1)] ← name changed -/// Recovery: SUM(__extracted_1) AS "SUM(user['balance'])" -/// ``` -/// -/// - **Schemas identical** → no recovery projection needed. -fn build_recovery_projection( - original_schema: &DFSchema, - input: LogicalPlan, -) -> Result { - let new_schema = input.schema(); - let orig_len = original_schema.fields().len(); - let new_len = new_schema.fields().len(); - - if orig_len == new_len { - // Same number of fields — check if schemas are identical - let schemas_match = original_schema.iter().zip(new_schema.iter()).all( - |((orig_q, orig_f), (new_q, new_f))| { - orig_f.name() == new_f.name() && orig_q == new_q - }, - ); - if schemas_match { - return Ok(input); - } - - // Schema-defining nodes (Projection, Aggregate): names may differ at some positions. - // Map positionally, aliasing where the name changed. - let mut proj_exprs = Vec::with_capacity(orig_len); - for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() { - let (new_qualifier, new_field) = new_schema.qualified_field(i); - if orig_field.name() == new_field.name() && orig_qualifier == new_qualifier { - proj_exprs.push(Expr::from((orig_qualifier, orig_field))); - } else { - let new_col = Expr::Column(Column::from((new_qualifier, new_field))); - proj_exprs.push( - new_col.alias_qualified(orig_qualifier.cloned(), orig_field.name()), - ); - } - } - let projection = Projection::try_new(proj_exprs, Arc::new(input))?; - Ok(LogicalPlan::Projection(projection)) - } else { - // Schema-preserving nodes: new schema has extra extraction columns. - // Original columns still exist by name; select them to hide extras. 
- let col_exprs: Vec = original_schema.iter().map(Expr::from).collect(); - let projection = Projection::try_new(col_exprs, Arc::new(input))?; - Ok(LogicalPlan::Projection(projection)) - } -} - -/// Collects `MoveTowardsLeafNodes` sub-expressions found during expression -/// tree traversal and can build an extraction projection from them. -/// -/// # Example -/// -/// Given `Filter: user['status'] = 'active' AND user['name'] IS NOT NULL`: -/// - `add_extracted(user['status'])` → stores it, returns `col("__extracted_1")` -/// - `add_extracted(user['name'])` → stores it, returns `col("__extracted_2")` -/// - `build_extraction_projection()` produces: -/// `Projection: user['status'] AS __extracted_1, user['name'] AS __extracted_2, ` -struct LeafExpressionExtractor<'a> { - /// Extracted expressions: maps expression -> alias - extracted: IndexMap, - /// Columns referenced by extracted expressions or the parent node, - /// included as pass-through in the extraction projection. - columns_needed: IndexSet, - /// Input schema - input_schema: &'a DFSchema, - /// Alias generator - alias_generator: &'a Arc, -} - -impl<'a> LeafExpressionExtractor<'a> { - fn new(input_schema: &'a DFSchema, alias_generator: &'a Arc) -> Self { - Self { - extracted: IndexMap::new(), - columns_needed: IndexSet::new(), - input_schema, - alias_generator, - } - } - - /// Adds an expression to extracted set, returns column reference. 
- fn add_extracted(&mut self, expr: Expr) -> Result { - // Deduplication: reuse existing alias if same expression - if let Some(alias) = self.extracted.get(&expr) { - return Ok(Expr::Column(Column::new_unqualified(alias))); - } - - // Track columns referenced by this expression - for col in expr.column_refs() { - self.columns_needed.insert(col.clone()); - } - - // Generate unique alias - let alias = self.alias_generator.next(EXTRACTED_EXPR_PREFIX); - self.extracted.insert(expr, alias.clone()); - - Ok(Expr::Column(Column::new_unqualified(&alias))) - } - - /// Builds a fresh extraction projection above the given input. - /// - /// Returns `None` if there are no extractions. Otherwise creates a new - /// projection that includes extracted expressions (aliased) plus all - /// input schema columns for pass-through. - fn build_extraction_projection( - &self, - input: &Arc, - ) -> Result> { - if self.extracted.is_empty() { - return Ok(None); - } - let mut proj_exprs = Vec::new(); - for (expr, alias) in self.extracted.iter() { - proj_exprs.push(expr.clone().alias(alias)); - } - for (qualifier, field) in self.input_schema.iter() { - proj_exprs.push(Expr::from((qualifier, field))); - } - Ok(Some(LogicalPlan::Projection(Projection::try_new( - proj_exprs, - Arc::clone(input), - )?))) - } -} - -/// Build an extraction projection above the target node. -/// -/// If the target is an existing projection, merges into it. This requires -/// resolving column references through the projection's rename mapping: -/// if the projection has `user AS u`, and an extracted expression references -/// `u['name']`, we must rewrite it to `user['name']` since the merged -/// projection reads from the same input as the original. -/// -/// Deduplicates by resolved expression equality and adds pass-through -/// columns as needed. Otherwise builds a fresh projection with extracted -/// expressions + ALL input schema columns. 
-fn build_extraction_projection_impl( - extracted_exprs: &[(Expr, String)], - columns_needed: &IndexSet, - target: &Arc, - target_schema: &DFSchema, -) -> Result { - if let LogicalPlan::Projection(existing) = target.as_ref() { - // Merge into existing projection - let mut proj_exprs = existing.expr.clone(); - - // Build a map of existing expressions (by Expr equality) to their aliases - let existing_extractions: IndexMap = existing - .expr - .iter() - .filter_map(|e| { - if let Expr::Alias(alias) = e - && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) - { - return Some((*alias.expr.clone(), alias.name.clone())); - } - None - }) - .collect(); - - // Resolve column references through the projection's rename mapping - let replace_map = build_projection_replace_map(existing); - - // Add new extracted expressions, resolving column refs through the projection - for (expr, alias) in extracted_exprs { - let resolved = replace_cols_by_name(expr.clone().alias(alias), &replace_map)?; - let resolved_inner = if let Expr::Alias(a) = &resolved { - a.expr.as_ref() - } else { - &resolved - }; - if let Some(existing_alias) = existing_extractions.get(resolved_inner) { - // Same expression already extracted under a different alias — - // add the expression with the new alias so both names are - // available in the output. We can't reference the existing alias - // as a column within the same projection, so we duplicate the - // computation. - if existing_alias != alias { - proj_exprs.push(resolved); - } - } else { - proj_exprs.push(resolved); - } - } - - // Add any new pass-through columns that aren't already in the projection. - // We check against existing.input.schema() (the projection's source) rather - // than target_schema (the projection's output) because columns produced - // by alias expressions (e.g., CSE's __common_expr_N) exist in the output but - // not the input, and cannot be added as pass-through Column references. 
- let existing_cols: IndexSet = existing - .expr - .iter() - .filter_map(|e| { - if let Expr::Column(c) = e { - Some(c.clone()) - } else { - None - } - }) - .collect(); - - let input_schema = existing.input.schema(); - for col in columns_needed { - let col_expr = Expr::Column(col.clone()); - let resolved = replace_cols_by_name(col_expr, &replace_map)?; - if let Expr::Column(resolved_col) = &resolved - && !existing_cols.contains(resolved_col) - && input_schema.has_column(resolved_col) - { - proj_exprs.push(Expr::Column(resolved_col.clone())); - } - // If resolved to non-column expr, it's already computed by existing projection - } - - Projection::try_new(proj_exprs, Arc::clone(&existing.input)) - } else { - // Build new projection with extracted expressions + all input columns - let mut proj_exprs = Vec::new(); - for (expr, alias) in extracted_exprs { - proj_exprs.push(expr.clone().alias(alias)); - } - for (qualifier, field) in target_schema.iter() { - proj_exprs.push(Expr::from((qualifier, field))); - } - Projection::try_new(proj_exprs, Arc::clone(target)) + Ok(Transformed::no(plan)) } } @@ -595,507 +155,10 @@ impl OptimizerRule for PushDownLeafProjections { fn rewrite( &self, plan: LogicalPlan, - config: &dyn OptimizerConfig, + _config: &dyn OptimizerConfig, ) -> Result> { - let alias_generator = config.alias_generator(); - match try_push_input(&plan, alias_generator)? { - Some(new_plan) => Ok(Transformed::yes(new_plan)), - None => Ok(Transformed::no(plan)), - } - } -} - -/// Attempts to push a projection's extractable expressions further down. -/// -/// Returns `Some(new_subtree)` if the projection was pushed down or merged, -/// `None` if there is nothing to push or the projection sits above a barrier. 
-fn try_push_input( - input: &LogicalPlan, - alias_generator: &Arc, -) -> Result> { - let LogicalPlan::Projection(proj) = input else { - return Ok(None); - }; - split_and_push_projection(proj, alias_generator) -} - -/// Splits a projection into extractable pieces, pushes them towards leaf -/// nodes, and adds a recovery projection if needed. -/// -/// Handles both: -/// - **Pure extraction projections** (all `__extracted` aliases + columns) -/// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions) -/// -/// Returns `Some(new_subtree)` if extractions were pushed down, -/// `None` if there is nothing to extract or push. -/// -/// # Example: Mixed Projection -/// -/// ```text -/// Input plan: -/// Projection: user['name'] IS NOT NULL AS has_name, id -/// Filter: ... -/// TableScan -/// -/// Phase 1 (Split): -/// extraction_pairs: [(user['name'], "__extracted_1")] -/// recovery_exprs: [__extracted_1 IS NOT NULL AS has_name, id] -/// -/// Phase 2 (Push): -/// Push extraction projection through Filter toward TableScan -/// -/// Phase 3 (Recovery): -/// Projection: __extracted_1 IS NOT NULL AS has_name, id <-- recovery -/// Filter: ... -/// Projection: user['name'] AS __extracted_1, id <-- extraction (pushed) -/// TableScan -/// ``` -fn split_and_push_projection( - proj: &Projection, - alias_generator: &Arc, -) -> Result> { - let input = &proj.input; - let input_schema = input.schema(); - - // ── Phase 1: Split ────────────────────────────────────────────────── - // For each projection expression, collect extraction pairs and build - // recovery expressions. - // - // Pre-existing `__extracted` aliases are inserted into the extractor's - // `IndexMap` with the **full** `Expr::Alias(…)` as the key, so the - // alias name participates in equality. This prevents collisions when - // CSE rewrites produce the same inner expression under different alias - // names (e.g. `__common_expr_4 AS __extracted_1` and - // `__common_expr_4 AS __extracted_3`). 
New extractions from - // `routing_extract` use bare (non-Alias) keys and get normal dedup. - // - // When building the final `extraction_pairs`, the Alias wrapper is - // stripped so consumers see the usual `(inner_expr, alias_name)` tuples. - - let mut extractors = vec![LeafExpressionExtractor::new( - input_schema.as_ref(), - alias_generator, - )]; - let input_column_sets = vec![schema_columns(input_schema.as_ref())]; - - let original_schema = proj.schema.as_ref(); - let mut recovery_exprs: Vec = Vec::with_capacity(proj.expr.len()); - let mut needs_recovery = false; - let mut has_new_extractions = false; - - for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) { - if let Expr::Alias(alias) = expr - && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) - { - // Insert the full Alias expression as the key so that - // distinct alias names don't collide in the IndexMap. - let alias_name = alias.name.clone(); - - for col_ref in alias.expr.column_refs() { - extractors[0].columns_needed.insert(col_ref.clone()); - } - - extractors[0] - .extracted - .insert(expr.clone(), alias_name.clone()); - recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name))); - } else if let Expr::Column(col) = expr { - // Plain column pass-through — track it in the extractor - extractors[0].columns_needed.insert(col.clone()); - recovery_exprs.push(expr.clone()); - } else { - // Everything else: run through routing_extract - let transformed = - routing_extract(expr.clone(), &mut extractors, &input_column_sets)?; - if transformed.transformed { - has_new_extractions = true; - } - let transformed_expr = transformed.data; - - // Build recovery expression, aliasing back to original name if needed - let original_name = field.name(); - let needs_alias = if let Expr::Column(col) = &transformed_expr { - col.name.as_str() != original_name - } else { - let expr_name = transformed_expr.schema_name().to_string(); - original_name != &expr_name - }; - let recovery_expr = if 
needs_alias { - needs_recovery = true; - transformed_expr - .clone() - .alias_qualified(qualifier.cloned(), original_name) - } else { - transformed_expr.clone() - }; - - // If the expression was transformed (i.e., has extracted sub-parts), - // it differs from what the pushed projection outputs → needs recovery. - // Also, any non-column, non-__extracted expression needs recovery - // because the pushed extraction projection won't output it directly. - if transformed.transformed || !matches!(expr, Expr::Column(_)) { - needs_recovery = true; - } - - recovery_exprs.push(recovery_expr); - } - } - - // Build extraction_pairs, stripping the Alias wrapper from pre-existing - // entries (they used the full Alias as the map key to avoid dedup). - let extractor = &extractors[0]; - let extraction_pairs: Vec<(Expr, String)> = extractor - .extracted - .iter() - .map(|(e, a)| match e { - Expr::Alias(alias) => (*alias.expr.clone(), a.clone()), - _ => (e.clone(), a.clone()), - }) - .collect(); - let columns_needed = &extractor.columns_needed; - - // If no extractions found, nothing to do - if extraction_pairs.is_empty() { - return Ok(None); - } - - // ── Phase 2: Push down ────────────────────────────────────────────── - let proj_input = Arc::clone(&proj.input); - let pushed = push_extraction_pairs( - &extraction_pairs, - columns_needed, - proj, - &proj_input, - alias_generator, - )?; - - // ── Phase 3: Recovery ─────────────────────────────────────────────── - match (pushed, needs_recovery) { - (Some(pushed_plan), true) => { - // Wrap with recovery projection - let recovery = LogicalPlan::Projection(Projection::try_new( - recovery_exprs, - Arc::new(pushed_plan), - )?); - Ok(Some(recovery)) - } - (Some(pushed_plan), false) => { - // No recovery needed (pure extraction projection) - Ok(Some(pushed_plan)) - } - (None, true) => { - // Push returned None but we still have extractions to apply. 
- // Build the extraction projection in-place (not pushed) so the - // recovery can resolve extracted expressions. - if !has_new_extractions { - // Only pre-existing __extracted aliases and columns, no new - // extractions from routing_extract. The original projection is - // already an extraction projection that couldn't be pushed - // further. Return None. - return Ok(None); - } - let input_arc = Arc::clone(input); - let extraction = build_extraction_projection_impl( - &extraction_pairs, - columns_needed, - &input_arc, - input_schema.as_ref(), - )?; - let extraction_plan = LogicalPlan::Projection(extraction); - let recovery = LogicalPlan::Projection(Projection::try_new( - recovery_exprs, - Arc::new(extraction_plan), - )?); - Ok(Some(recovery)) - } - (None, false) => { - // No extractions could be pushed and no recovery needed - Ok(None) - } - } -} - -/// Returns true if the plan is a Projection where ALL expressions are either -/// `Alias(EXTRACTED_EXPR_PREFIX, ...)` or `Column`, with at least one extraction. -/// Such projections can safely be pushed further without re-extraction. -fn is_pure_extraction_projection(plan: &LogicalPlan) -> bool { - let LogicalPlan::Projection(proj) = plan else { - return false; - }; - let mut has_extraction = false; - for expr in &proj.expr { - match expr { - Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX) => { - has_extraction = true; - } - Expr::Column(_) => {} - _ => return false, - } - } - has_extraction -} - -/// Pushes extraction pairs down through the projection's input node, -/// dispatching to the appropriate handler based on the input node type. -fn push_extraction_pairs( - pairs: &[(Expr, String)], - columns_needed: &IndexSet, - proj: &Projection, - proj_input: &Arc, - alias_generator: &Arc, -) -> Result> { - match proj_input.as_ref() { - // Merge into existing projection, then try to push the result further down. - // Only merge when all outer expressions are captured (pairs + columns). 
- // Uncaptured expressions (e.g. `col AS __common_expr_1`) would be lost - // during the merge since build_extraction_projection_impl only knows - // about the captured pairs and columns. - LogicalPlan::Projection(_) - if pairs.len() + columns_needed.len() == proj.expr.len() => - { - let target_schema = Arc::clone(proj_input.schema()); - let merged = build_extraction_projection_impl( - pairs, - columns_needed, - proj_input, - target_schema.as_ref(), - )?; - let merged_plan = LogicalPlan::Projection(merged); - - // After merging, try to push the result further down, but ONLY - // if the merged result is still a pure extraction projection - // (all __extracted aliases + columns). If the merge inherited - // bare MoveTowardsLeafNodes expressions from the inner projection, - // pushing would re-extract them into new aliases and fail when - // the (None, true) fallback can't find the original aliases. - // This handles: Extraction → Recovery(cols) → Filter → ... → TableScan - // by pushing through the recovery projection AND the filter in one pass. - if is_pure_extraction_projection(&merged_plan) - && let Some(pushed) = try_push_input(&merged_plan, alias_generator)? - { - return Ok(Some(pushed)); - } - Ok(Some(merged_plan)) - } - // Generic: handles Filter/Sort/Limit (via recursion), - // SubqueryAlias (with qualifier remap in try_push_into_inputs), - // Join, and anything else. - // Safely bails out for nodes that don't pass through extracted - // columns (Aggregate, Window) via the output schema check. - _ => try_push_into_inputs( - pairs, - columns_needed, - proj_input.as_ref(), - alias_generator, - ), - } -} - -/// Pushes extraction expressions into a node's inputs by routing each -/// expression to the input that owns all of its column references. -/// -/// Works for any number of inputs (1, 2, …N). For single-input nodes, -/// all expressions trivially route to that input. 
For multi-input nodes -/// (Join, etc.), each expression is routed to the side that owns its columns. -/// -/// Returns `Some(new_node)` if all expressions could be routed AND the -/// rebuilt node's output schema contains all extracted aliases. -/// Returns `None` if any expression references columns from multiple inputs -/// or the node doesn't pass through the extracted columns. -/// -/// # Example: Join with expressions from both sides -/// -/// ```text -/// Extraction projection above a Join: -/// Projection: left.user['name'] AS __extracted_1, right.order['total'] AS __extracted_2, ... -/// Join: left.id = right.user_id -/// TableScan: left [id, user] -/// TableScan: right [user_id, order] -/// -/// After routing each expression to its owning input: -/// Join: left.id = right.user_id -/// Projection: user['name'] AS __extracted_1, id, user <-- left-side extraction -/// TableScan: left [id, user] -/// Projection: order['total'] AS __extracted_2, user_id, order <-- right-side extraction -/// TableScan: right [user_id, order] -/// ``` -fn try_push_into_inputs( - pairs: &[(Expr, String)], - columns_needed: &IndexSet, - node: &LogicalPlan, - alias_generator: &Arc, -) -> Result> { - let inputs = node.inputs(); - if inputs.is_empty() { - return Ok(None); - } - - // SubqueryAlias remaps qualifiers between input and output. - // Rewrite pairs/columns from alias-space to input-space before routing. 
- let (pairs, columns_needed) = if let LogicalPlan::SubqueryAlias(sa) = node { - let mut replace_map = HashMap::new(); - for ((input_q, input_f), (alias_q, alias_f)) in - sa.input.schema().iter().zip(sa.schema.iter()) - { - replace_map.insert( - qualified_name(alias_q, alias_f.name()), - Expr::Column(Column::new(input_q.cloned(), input_f.name())), - ); - } - let remapped_pairs: Vec<(Expr, String)> = pairs - .iter() - .map(|(expr, alias)| { - Ok(( - replace_cols_by_name(expr.clone(), &replace_map)?, - alias.clone(), - )) - }) - .collect::>()?; - let remapped_columns: IndexSet = columns_needed - .iter() - .filter_map(|col| { - let rewritten = - replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; - if let Expr::Column(c) = rewritten { - Some(c) - } else { - Some(col.clone()) - } - }) - .collect(); - (remapped_pairs, remapped_columns) - } else { - (pairs.to_vec(), columns_needed.clone()) - }; - let pairs = &pairs[..]; - let columns_needed = &columns_needed; - - let num_inputs = inputs.len(); - - // Build per-input column sets using existing schema_columns() - let input_schemas: Vec> = - inputs.iter().map(|i| Arc::clone(i.schema())).collect(); - let input_column_sets: Vec> = - input_schemas.iter().map(|s| schema_columns(s)).collect(); - - // Route pairs and columns to inputs. - // Union: all inputs share the same schema, so broadcast to every branch. - // Everything else (Join, single-input nodes): columns are disjoint across - // inputs, so route each expression to its owning input. - let broadcast = matches!(node, LogicalPlan::Union(_)); - - let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; - let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; - - if broadcast { - // Union output schema and each input schema have the same fields by - // index but may differ in qualifiers (e.g. output `s` vs input - // `simple_struct.s`). Remap pairs/columns to each input's space. 
- let union_schema = node.schema(); - for (idx, input_schema) in input_schemas.iter().enumerate() { - let mut remap = HashMap::new(); - for ((out_q, out_f), (in_q, in_f)) in - union_schema.iter().zip(input_schema.iter()) - { - remap.insert( - qualified_name(out_q, out_f.name()), - Expr::Column(Column::new(in_q.cloned(), in_f.name())), - ); - } - per_input_pairs[idx] = pairs - .iter() - .map(|(expr, alias)| { - Ok((replace_cols_by_name(expr.clone(), &remap)?, alias.clone())) - }) - .collect::>()?; - per_input_columns[idx] = columns_needed - .iter() - .filter_map(|col| { - let rewritten = - replace_cols_by_name(Expr::Column(col.clone()), &remap).ok()?; - if let Expr::Column(c) = rewritten { - Some(c) - } else { - Some(col.clone()) - } - }) - .collect(); - } - } else { - for (expr, alias) in pairs { - match find_owning_input(expr, &input_column_sets) { - Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), - None => return Ok(None), // Cross-input expression — bail out - } - } - for col in columns_needed { - let col_expr = Expr::Column(col.clone()); - match find_owning_input(&col_expr, &input_column_sets) { - Some(idx) => { - per_input_columns[idx].insert(col.clone()); - } - None => return Ok(None), // Ambiguous column — bail out - } - } - } - - // Check at least one input has extractions to push - if per_input_pairs.iter().all(|p| p.is_empty()) { - return Ok(None); - } - - // Build per-input extraction projections and push them as far as possible - // immediately. This is critical because map_children preserves cached schemas, - // so if the TopDown pass later pushes a child further (changing its output - // schema), the parent node's schema becomes stale. 
- let mut new_inputs: Vec = Vec::with_capacity(num_inputs); - for (idx, input) in inputs.into_iter().enumerate() { - if per_input_pairs[idx].is_empty() { - new_inputs.push(input.clone()); - } else { - let input_arc = Arc::new(input.clone()); - let target_schema = Arc::clone(input.schema()); - let proj = build_extraction_projection_impl( - &per_input_pairs[idx], - &per_input_columns[idx], - &input_arc, - target_schema.as_ref(), - )?; - // Verify all requested aliases appear in the projection's output. - // A merge may deduplicate if the same expression already exists - // under a different alias, leaving the requested alias missing. - let proj_schema = proj.schema.as_ref(); - for (_expr, alias) in &per_input_pairs[idx] { - if !proj_schema.fields().iter().any(|f| f.name() == alias) { - return Ok(None); - } - } - let proj_plan = LogicalPlan::Projection(proj); - // Try to push the extraction projection further down within - // this input (e.g., through Filter → existing extraction projection). - // This ensures the input's output schema is stable and won't change - // when the TopDown pass later visits children. - match try_push_input(&proj_plan, alias_generator)? { - Some(pushed) => new_inputs.push(pushed), - None => new_inputs.push(proj_plan), - } - } - } - - // Rebuild the node with new inputs - let new_node = node.with_new_exprs(node.expressions(), new_inputs)?; - - // Safety check: verify all extracted aliases appear in the rebuilt - // node's output schema. Nodes like Aggregate define their own output - // and won't pass through extracted columns — bail out for those. 
- let output_schema = new_node.schema(); - for (_expr, alias) in pairs { - if !output_schema.fields().iter().any(|f| f.name() == alias) { - return Ok(None); - } + Ok(Transformed::no(plan)) } - - Ok(Some(new_node)) } #[cfg(test)] @@ -1223,20 +286,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) ## Optimized - Projection: test.id - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1279,13 +335,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1309,13 +362,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 IS NOT NULL AS has_name - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) IS NOT NULL AS has_name - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1361,10 +411,7 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, test.user - Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS 
NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) @@ -1387,10 +434,7 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, test.user - Filter: __datafusion_extracted_1 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) @@ -1415,19 +459,13 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1449,10 +487,7 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(mock_leaf(test.user,Utf8("value"))) - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown (same as after extraction) @@ -1477,23 +512,13 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: mock_leaf(test.user, Utf8("name")) - Projection: test.user - Filter: 
__datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1513,13 +538,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS username - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS username - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1541,17 +563,10 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: test.user, mock_leaf(test.user, Utf8("label")) - Projection: test.user - Filter: __datafusion_extracted_1 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown - Projection: test.user, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("label")) - Filter: __datafusion_extracted_1 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("label")) AS 
__datafusion_extracted_2 - TableScan: test projection=[user] + (same as after extraction) ## Optimized (same as after pushdown) @@ -1575,13 +590,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")), mock_leaf(test.user, Utf8("name")) AS name2 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1608,10 +620,7 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Sort: test.user ASC NULLS FIRST - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized (same as after pushdown) @@ -1637,16 +646,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1669,9 +672,7 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown (same as after extraction) 
@@ -1750,28 +751,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, test.user - Filter: __datafusion_extracted_1 IS NOT NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user - Projection: test.id, test.user - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, test.user - Filter: __datafusion_extracted_1 IS NOT NULL - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user - Filter: __datafusion_extracted_1 IS NOT NULL - Projection: test.id, test.user, __datafusion_extracted_1 - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1793,13 +779,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1869,28 +852,13 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - 
Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - Projection: test.user - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: __datafusion_extracted_1 - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1910,28 +878,13 @@ mod tests { TableScan: test projection=[a, b, c] ## After Extraction - Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_1 = Int32(2) - Projection: mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c - Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_2 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c - TableScan: test projection=[a, b, c] + (same as original) ## After Pushdown - Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_1 = Int32(2) - Filter: __datafusion_extracted_2 = Int32(1) - Projection: 
mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 - TableScan: test projection=[a, b, c] + (same as after extraction) ## Optimized - Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_1 = Int32(2) - Projection: test.a, test.b, test.c, __datafusion_extracted_1 - Filter: __datafusion_extracted_2 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 - TableScan: test projection=[a, b, c] + (same as after pushdown) "#) } @@ -1973,12 +926,7 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user - TableScan: right projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) @@ -2014,11 +962,7 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] - TableScan: right projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) @@ -2055,12 +999,7 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin") - Projection: mock_leaf(test.user, 
Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user - TableScan: right projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) @@ -2128,34 +1067,13 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user - TableScan: right projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_1 = Utf8("active") - Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user - Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, 
Utf8("status")) AS __datafusion_extracted_1 - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2187,20 +1105,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) - Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) - Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2229,16 +1137,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) - Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) - Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2263,16 +1165,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 IS NOT 
NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL - Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL - Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2292,23 +1188,13 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: x - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(x, Utf8("a")) AS __datafusion_extracted_1, x - Projection: test.user AS x - TableScan: test projection=[user] + (same as original) ## After Pushdown - Projection: x - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: x - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2335,16 +1221,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) - SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) - SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2366,26 +1246,13 @@ mod tests { TableScan: test projection=[user] ## After Extraction - 
Projection: mock_leaf(sub.user, Utf8("name")) - Projection: sub.user - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user - SubqueryAlias: sub - TableScan: test projection=[user] + (same as original) ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2410,18 +1277,10 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) - SubqueryAlias: outer_sub - SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) - SubqueryAlias: outer_sub - SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -2506,20 +1365,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS 
__datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown (same as after extraction) ## Optimized - Projection: test.id - Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -2544,23 +1396,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")) - Projection: test.id, test.user - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -2581,23 +1423,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("status")) - Projection: test.id, test.user - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, 
Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -2634,28 +1466,13 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(right.user, Utf8("status")) - Projection: test.id, test.user, right.id, right.user - Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user - TableScan: right projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 - TableScan: right 
projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2680,23 +1497,13 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("status")) - Projection: test.id, test.user - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user - TableScan: test projection=[id, user] + (same as original) ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS 
__datafusion_extracted_3 - TableScan: test projection=[id, user] + (same as after pushdown) "#) } } From 87222e96bb08338f73bb3a0be75aab8723afbbf6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:47:04 -0500 Subject: [PATCH 6/9] minimze diff --- .../optimizer/src/extract_leaf_expressions.rs | 109 +++++++++--------- datafusion/optimizer/src/test/udfs.rs | 22 +--- 2 files changed, 58 insertions(+), 73 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index c55e9b9168123..d5ff0ebbff5f9 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -177,14 +177,11 @@ mod tests { ScalarUDF, col, lit, logical_plan::builder::LogicalPlanBuilder, }; - fn mock_leaf(expr: Expr, name: &str) -> Expr { + fn leaf_udf(expr: Expr, name: &str) -> Expr { Expr::ScalarFunction(ScalarFunction::new_udf( Arc::new(ScalarUDF::new_from_impl( PlacementTestUDF::new() - .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) - // Use mock_leaf to minimize snapshot churn vs. previous implementation that used a UDF with this name. - // We can remove this name change and accept the snapshot diff in the future. - .with_name("mock_leaf"), + .with_placement(ExpressionPlacement::MoveTowardsLeafNodes), )), vec![expr, lit(name)], )) @@ -270,7 +267,7 @@ mod tests { fn test_extract_from_filter() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan.clone()) - .filter(mock_leaf(col("user"), "status").eq(lit("active")))? + .filter(leaf_udf(col("user"), "status").eq(lit("active")))? 
.select(vec![ table_scan .schema() @@ -323,7 +320,7 @@ mod tests { fn test_extract_from_projection() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![mock_leaf(col("user"), "name")])? + .project(vec![leaf_udf(col("user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -347,7 +344,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ - mock_leaf(col("user"), "name") + leaf_udf(col("user"), "name") .is_not_null() .alias("has_name"), ])? @@ -394,7 +391,7 @@ mod tests { #[test] fn test_filter_with_deduplication() -> Result<()> { let table_scan = test_table_scan_with_struct()?; - let field_access = mock_leaf(col("user"), "name"); + let field_access = leaf_udf(col("user"), "name"); // Filter with the same expression used twice let plan = LogicalPlanBuilder::from(table_scan) .filter( @@ -425,7 +422,7 @@ mod tests { fn test_already_leaf_expression_in_filter() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "name").eq(lit("test")))? + .filter(leaf_udf(col("user"), "name").eq(lit("test")))? .build()?; assert_stages!(plan, @r#" @@ -450,7 +447,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .aggregate(vec![mock_leaf(col("user"), "status")], vec![count(lit(1))])? + .aggregate(vec![leaf_udf(col("user"), "status")], vec![count(lit(1))])? .build()?; assert_stages!(plan, @r#" @@ -477,7 +474,7 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .aggregate( vec![col("user")], - vec![count(mock_leaf(col("user"), "value"))], + vec![count(leaf_udf(col("user"), "value"))], )? 
.build()?; @@ -501,8 +498,8 @@ mod tests { fn test_projection_with_filter_combined() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "status").eq(lit("active")))? - .project(vec![mock_leaf(col("user"), "name")])? + .filter(leaf_udf(col("user"), "status").eq(lit("active")))? + .project(vec![leaf_udf(col("user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -526,7 +523,7 @@ mod tests { fn test_projection_preserves_alias() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .project(vec![mock_leaf(col("user"), "name").alias("username")])? + .project(vec![leaf_udf(col("user"), "name").alias("username")])? .build()?; assert_stages!(plan, @r#" @@ -552,8 +549,8 @@ mod tests { fn test_projection_different_field_from_filter() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "value").gt(lit(150)))? - .project(vec![col("user"), mock_leaf(col("user"), "label")])? + .filter(leaf_udf(col("user"), "value").gt(lit(150)))? + .project(vec![col("user"), leaf_udf(col("user"), "label")])? .build()?; assert_stages!(plan, @r#" @@ -576,7 +573,7 @@ mod tests { #[test] fn test_projection_deduplication() -> Result<()> { let table_scan = test_table_scan_with_struct()?; - let field = mock_leaf(col("user"), "name"); + let field = leaf_udf(col("user"), "name"); let plan = LogicalPlanBuilder::from(table_scan) .project(vec![field.clone(), field.clone().alias("name2")])? .build()?; @@ -607,7 +604,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .sort(vec![col("user").sort(true, true)])? - .project(vec![mock_leaf(col("user"), "name")])? + .project(vec![leaf_udf(col("user"), "name")])? 
.build()?; assert_stages!(plan, @r#" @@ -633,7 +630,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .limit(0, Some(10))? - .project(vec![mock_leaf(col("user"), "name")])? + .project(vec![leaf_udf(col("user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -662,7 +659,7 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .aggregate( vec![col("user")], - vec![count(mock_leaf(col("user"), "value")).alias("cnt")], + vec![count(leaf_udf(col("user"), "value")).alias("cnt")], )? .build()?; @@ -714,7 +711,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .project(vec![ - mock_leaf(col("user"), "name").alias("__datafusion_extracted_manual"), + leaf_udf(col("user"), "name").alias("__datafusion_extracted_manual"), col("user"), ])? .build()?; @@ -740,8 +737,8 @@ mod tests { fn test_merge_into_existing_extracted_projection() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "status").eq(lit("active")))? - .filter(mock_leaf(col("user"), "name").is_not_null())? + .filter(leaf_udf(col("user"), "status").eq(lit("active")))? + .filter(leaf_udf(col("user"), "name").is_not_null())? .build()?; assert_stages!(plan, @r#" @@ -767,7 +764,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("user")])? - .project(vec![mock_leaf(col("user"), "name")])? + .project(vec![leaf_udf(col("user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -841,8 +838,8 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "status").eq(lit("active")))? - .aggregate(vec![mock_leaf(col("user"), "name")], vec![count(lit(1))])? + .filter(leaf_udf(col("user"), "status").eq(lit("active")))? 
+ .aggregate(vec![leaf_udf(col("user"), "name")], vec![count(lit(1))])? .build()?; assert_stages!(plan, @r#" @@ -867,8 +864,8 @@ mod tests { fn test_merge_with_new_columns() -> Result<()> { let table_scan = test_table_scan()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("a"), "x").eq(lit(1)))? - .filter(mock_leaf(col("b"), "y").eq(lit(2)))? + .filter(leaf_udf(col("a"), "x").eq(lit(1)))? + .filter(leaf_udf(col("b"), "y").eq(lit(2)))? .build()?; assert_stages!(plan, @r#" @@ -912,8 +909,8 @@ mod tests { right, JoinType::Inner, ( - vec![mock_leaf(col("user"), "id")], - vec![mock_leaf(col("user"), "id")], + vec![leaf_udf(col("user"), "id")], + vec![leaf_udf(col("user"), "id")], ), None, )? @@ -950,7 +947,7 @@ mod tests { JoinType::Inner, vec![ col("test.user").eq(col("right.user")), - mock_leaf(col("test.user"), "status").eq(lit("active")), + leaf_udf(col("test.user"), "status").eq(lit("active")), ], )? .build()?; @@ -986,8 +983,8 @@ mod tests { JoinType::Inner, vec![ col("test.user").eq(col("right.user")), - mock_leaf(col("test.user"), "status").eq(lit("active")), - mock_leaf(col("right.user"), "role").eq(lit("admin")), + leaf_udf(col("test.user"), "status").eq(lit("active")), + leaf_udf(col("right.user"), "role").eq(lit("admin")), ], )? .build()?; @@ -1051,12 +1048,12 @@ mod tests { right, JoinType::Inner, ( - vec![mock_leaf(col("user"), "id")], - vec![mock_leaf(col("user"), "id")], + vec![leaf_udf(col("user"), "id")], + vec![leaf_udf(col("user"), "id")], ), None, )? - .filter(mock_leaf(col("test.user"), "status").eq(lit("active")))? + .filter(leaf_udf(col("test.user"), "status").eq(lit("active")))? .build()?; assert_stages!(plan, @r#" @@ -1089,8 +1086,8 @@ mod tests { let plan = LogicalPlanBuilder::from(left) .join(right, JoinType::Inner, (vec!["id"], vec!["id"]), None)? 
.project(vec![ - mock_leaf(col("test.user"), "status"), - mock_leaf(col("right.user"), "role"), + leaf_udf(col("test.user"), "status"), + leaf_udf(col("right.user"), "role"), ])? .build()?; @@ -1123,7 +1120,7 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("user").alias("x")])? .filter(col("x").is_not_null())? - .project(vec![mock_leaf(col("x"), "a")])? + .project(vec![leaf_udf(col("x"), "a")])? .build()?; assert_stages!(plan, @r#" @@ -1151,7 +1148,7 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("user").alias("x")])? .filter(col("x").is_not_null())? - .project(vec![mock_leaf(col("x"), "a").is_not_null()])? + .project(vec![leaf_udf(col("x"), "a").is_not_null()])? .build()?; assert_stages!(plan, @r#" @@ -1178,7 +1175,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .project(vec![col("user").alias("x")])? - .filter(mock_leaf(col("x"), "a").eq(lit("active")))? + .filter(leaf_udf(col("x"), "a").eq(lit("active")))? .build()?; assert_stages!(plan, @r#" @@ -1208,7 +1205,7 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .alias("sub")? - .project(vec![mock_leaf(col("sub.user"), "name")])? + .project(vec![leaf_udf(col("sub.user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -1234,8 +1231,8 @@ mod tests { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) .alias("sub")? - .filter(mock_leaf(col("sub.user"), "status").eq(lit("active")))? - .project(vec![mock_leaf(col("sub.user"), "name")])? + .filter(leaf_udf(col("sub.user"), "status").eq(lit("active")))? + .project(vec![leaf_udf(col("sub.user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -1263,7 +1260,7 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .alias("inner_sub")? .alias("outer_sub")? - .project(vec![mock_leaf(col("outer_sub.user"), "name")])? 
+ .project(vec![leaf_udf(col("outer_sub.user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -1317,13 +1314,11 @@ mod tests { let udf_a = Arc::new(ScalarUDF::new_from_impl( PlacementTestUDF::new() .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) - .with_name("mock_leaf") .with_id(1), )); let udf_b = Arc::new(ScalarUDF::new_from_impl( PlacementTestUDF::new() .with_placement(ExpressionPlacement::MoveTowardsLeafNodes) - .with_name("mock_leaf") .with_id(2), )); @@ -1385,8 +1380,8 @@ mod tests { fn test_extraction_pushdown_through_filter_with_extracted_predicate() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "status").eq(lit("active")))? - .project(vec![col("id"), mock_leaf(col("user"), "name")])? + .filter(leaf_udf(col("user"), "status").eq(lit("active")))? + .project(vec![col("id"), leaf_udf(col("user"), "name")])? .build()?; assert_stages!(plan, @r#" @@ -1410,7 +1405,7 @@ mod tests { #[test] fn test_extraction_pushdown_same_expr_in_filter_and_projection() -> Result<()> { let table_scan = test_table_scan_with_struct()?; - let field_expr = mock_leaf(col("user"), "status"); + let field_expr = leaf_udf(col("user"), "status"); let plan = LogicalPlanBuilder::from(table_scan) .filter(field_expr.clone().gt(lit(5)))? .project(vec![col("id"), field_expr])? @@ -1448,13 +1443,13 @@ mod tests { JoinType::Left, vec![ col("test.id").eq(col("right.id")), - mock_leaf(col("right.user"), "status").gt(lit(5)), + leaf_udf(col("right.user"), "status").gt(lit(5)), ], )? .project(vec![ col("test.id"), - mock_leaf(col("test.user"), "name"), - mock_leaf(col("right.user"), "status"), + leaf_udf(col("test.user"), "name"), + leaf_udf(col("right.user"), "status"), ])? 
.build()?; @@ -1482,11 +1477,11 @@ mod tests { fn test_pure_extraction_proj_push_through_filter() -> Result<()> { let table_scan = test_table_scan_with_struct()?; let plan = LogicalPlanBuilder::from(table_scan) - .filter(mock_leaf(col("user"), "status").gt(lit(5)))? + .filter(leaf_udf(col("user"), "status").gt(lit(5)))? .project(vec![ col("id"), - mock_leaf(col("user"), "name"), - mock_leaf(col("user"), "status"), + leaf_udf(col("user"), "name"), + leaf_udf(col("user"), "status"), ])? .build()?; diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs index 35ea3e44d3e72..9164603dba3d5 100644 --- a/datafusion/optimizer/src/test/udfs.rs +++ b/datafusion/optimizer/src/test/udfs.rs @@ -30,7 +30,6 @@ use datafusion_expr::{ pub struct PlacementTestUDF { signature: Signature, placement: ExpressionPlacement, - name: String, id: usize, } @@ -43,7 +42,6 @@ impl Default for PlacementTestUDF { impl PlacementTestUDF { pub fn new() -> Self { Self { - name: "leaf_udf".to_string(), // Accept any one or two arguments and return UInt32 for testing purposes. // The actual types don't matter since this UDF is not intended for execution. signature: Signature::new( @@ -59,19 +57,6 @@ impl PlacementTestUDF { /// This also resets the name of the UDF to a default based on the placement. pub fn with_placement(mut self, placement: ExpressionPlacement) -> Self { self.placement = placement; - self.name = match self.placement { - ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", - ExpressionPlacement::KeepInPlace => "keep_in_place_udf", - ExpressionPlacement::Column => "column_udf", - ExpressionPlacement::Literal => "literal_udf", - } - .to_string(); - self - } - - /// Set the name of the UDF, which is used in the expression and thus in optimizer rules. 
- pub fn with_name(mut self, name: &str) -> Self { - self.name = name.to_string(); self } @@ -88,7 +73,12 @@ impl ScalarUDFImpl for PlacementTestUDF { self } fn name(&self) -> &str { - &self.name + match self.placement { + ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", + ExpressionPlacement::KeepInPlace => "keep_in_place_udf", + ExpressionPlacement::Column => "column_udf", + ExpressionPlacement::Literal => "literal_udf", + } } fn signature(&self) -> &Signature { &self.signature From 3e75b5bf1d5e182b7ef5e7a5ca6613357db77b88 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:47:53 -0500 Subject: [PATCH 7/9] minimze diff --- .../optimizer/src/extract_leaf_expressions.rs | 92 +++++++++---------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index d5ff0ebbff5f9..d04261456d600 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -279,7 +279,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan Projection: test.id - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[id, user] ## After Extraction @@ -325,7 +325,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) TableScan: test projection=[user] ## After Extraction @@ -352,7 +352,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) IS NOT NULL AS has_name + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name TableScan: test projection=[user] ## After Extraction @@ -404,7 +404,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: 
mock_leaf(test.user, Utf8("name")) IS NOT NULL AND mock_leaf(test.user, Utf8("name")) IS NULL + Filter: leaf_udf(test.user, Utf8("name")) IS NOT NULL AND leaf_udf(test.user, Utf8("name")) IS NULL TableScan: test projection=[id, user] ## After Extraction @@ -427,7 +427,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: mock_leaf(test.user, Utf8("name")) = Utf8("test") + Filter: leaf_udf(test.user, Utf8("name")) = Utf8("test") TableScan: test projection=[id, user] ## After Extraction @@ -452,7 +452,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Aggregate: groupBy=[[mock_leaf(test.user, Utf8("status"))]], aggr=[[COUNT(Int32(1))]] + Aggregate: groupBy=[[leaf_udf(test.user, Utf8("status"))]], aggr=[[COUNT(Int32(1))]] TableScan: test projection=[user] ## After Extraction @@ -480,7 +480,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(mock_leaf(test.user, Utf8("value")))]] + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(leaf_udf(test.user, Utf8("value")))]] TableScan: test projection=[user] ## After Extraction @@ -504,8 +504,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") + Projection: leaf_udf(test.user, Utf8("name")) + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[user] ## After Extraction @@ -528,7 +528,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) AS username + Projection: leaf_udf(test.user, Utf8("name")) AS username TableScan: test projection=[user] ## After Extraction @@ -555,8 +555,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: test.user, mock_leaf(test.user, Utf8("label")) - Filter: mock_leaf(test.user, Utf8("value")) > Int32(150) + Projection: test.user, leaf_udf(test.user, Utf8("label")) + Filter: 
leaf_udf(test.user, Utf8("value")) > Int32(150) TableScan: test projection=[user] ## After Extraction @@ -580,7 +580,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("name")) AS name2 + Projection: leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2 TableScan: test projection=[user] ## After Extraction @@ -609,7 +609,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) Sort: test.user ASC NULLS FIRST TableScan: test projection=[user] @@ -635,7 +635,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) Limit: skip=0, fetch=10 TableScan: test projection=[user] @@ -665,7 +665,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(mock_leaf(test.user, Utf8("value"))) AS cnt]] + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(leaf_udf(test.user, Utf8("value"))) AS cnt]] TableScan: test projection=[user] ## After Extraction @@ -718,7 +718,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_manual, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_manual, test.user TableScan: test projection=[user] ## After Extraction @@ -743,8 +743,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: mock_leaf(test.user, Utf8("name")) IS NOT NULL - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") + Filter: leaf_udf(test.user, Utf8("name")) IS NOT NULL + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[id, user] ## After Extraction @@ -769,7 +769,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, 
Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) TableScan: test projection=[user] ## After Extraction @@ -844,8 +844,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Aggregate: groupBy=[[mock_leaf(test.user, Utf8("name"))]], aggr=[[COUNT(Int32(1))]] - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") + Aggregate: groupBy=[[leaf_udf(test.user, Utf8("name"))]], aggr=[[COUNT(Int32(1))]] + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[user] ## After Extraction @@ -870,8 +870,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: mock_leaf(test.b, Utf8("y")) = Int32(2) - Filter: mock_leaf(test.a, Utf8("x")) = Int32(1) + Filter: leaf_udf(test.b, Utf8("y")) = Int32(2) + Filter: leaf_udf(test.a, Utf8("x")) = Int32(1) TableScan: test projection=[a, b, c] ## After Extraction @@ -918,7 +918,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Inner Join: mock_leaf(test.user, Utf8("id")) = mock_leaf(right.user, Utf8("id")) + Inner Join: leaf_udf(test.user, Utf8("id")) = leaf_udf(right.user, Utf8("id")) TableScan: test projection=[id, user] TableScan: right projection=[id, user] @@ -954,7 +954,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Inner Join: Filter: test.user = right.user AND mock_leaf(test.user, Utf8("status")) = Utf8("active") + Inner Join: Filter: test.user = right.user AND leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[id, user] TableScan: right projection=[id, user] @@ -991,7 +991,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Inner Join: Filter: test.user = right.user AND mock_leaf(test.user, Utf8("status")) = Utf8("active") AND mock_leaf(right.user, Utf8("role")) = Utf8("admin") + Inner Join: Filter: test.user = right.user AND leaf_udf(test.user, Utf8("status")) = Utf8("active") AND leaf_udf(right.user, Utf8("role")) = Utf8("admin") TableScan: test projection=[id, user] TableScan: right 
projection=[id, user] @@ -1058,8 +1058,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") - Inner Join: mock_leaf(test.user, Utf8("id")) = mock_leaf(right.user, Utf8("id")) + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") + Inner Join: leaf_udf(test.user, Utf8("id")) = leaf_udf(right.user, Utf8("id")) TableScan: test projection=[id, user] TableScan: right projection=[id, user] @@ -1093,7 +1093,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(test.user, Utf8("status")), mock_leaf(right.user, Utf8("role")) + Projection: leaf_udf(test.user, Utf8("status")), leaf_udf(right.user, Utf8("role")) Inner Join: test.id = right.id TableScan: test projection=[id, user] TableScan: right projection=[id, user] @@ -1125,7 +1125,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(x, Utf8("a")) + Projection: leaf_udf(x, Utf8("a")) Filter: x IS NOT NULL Projection: test.user AS x TableScan: test projection=[user] @@ -1153,7 +1153,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(x, Utf8("a")) IS NOT NULL + Projection: leaf_udf(x, Utf8("a")) IS NOT NULL Filter: x IS NOT NULL Projection: test.user AS x TableScan: test projection=[user] @@ -1180,7 +1180,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Filter: mock_leaf(x, Utf8("a")) = Utf8("active") + Filter: leaf_udf(x, Utf8("a")) = Utf8("active") Projection: test.user AS x TableScan: test projection=[user] @@ -1210,7 +1210,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(sub.user, Utf8("name")) + Projection: leaf_udf(sub.user, Utf8("name")) SubqueryAlias: sub TableScan: test projection=[user] @@ -1237,8 +1237,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(sub.user, Utf8("name")) - Filter: mock_leaf(sub.user, Utf8("status")) = Utf8("active") + Projection: 
leaf_udf(sub.user, Utf8("name")) + Filter: leaf_udf(sub.user, Utf8("status")) = Utf8("active") SubqueryAlias: sub TableScan: test projection=[user] @@ -1265,7 +1265,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: mock_leaf(outer_sub.user, Utf8("name")) + Projection: leaf_udf(outer_sub.user, Utf8("name")) SubqueryAlias: outer_sub SubqueryAlias: inner_sub TableScan: test projection=[user] @@ -1356,7 +1356,7 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan Projection: test.id - Filter: mock_leaf(test.user, Utf8("field")) = Utf8("a") AND mock_leaf(test.user, Utf8("field")) = Utf8("b") + Filter: leaf_udf(test.user, Utf8("field")) = Utf8("a") AND leaf_udf(test.user, Utf8("field")) = Utf8("b") TableScan: test projection=[id, user] ## After Extraction @@ -1386,8 +1386,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: test.id, mock_leaf(test.user, Utf8("name")) - Filter: mock_leaf(test.user, Utf8("status")) = Utf8("active") + Projection: test.id, leaf_udf(test.user, Utf8("name")) + Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[id, user] ## After Extraction @@ -1413,8 +1413,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: test.id, mock_leaf(test.user, Utf8("status")) - Filter: mock_leaf(test.user, Utf8("status")) > Int32(5) + Projection: test.id, leaf_udf(test.user, Utf8("status")) + Filter: leaf_udf(test.user, Utf8("status")) > Int32(5) TableScan: test projection=[id, user] ## After Extraction @@ -1455,8 +1455,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(right.user, Utf8("status")) - Left Join: Filter: test.id = right.id AND mock_leaf(right.user, Utf8("status")) > Int32(5) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(right.user, Utf8("status")) + Left Join: Filter: test.id = right.id AND leaf_udf(right.user, Utf8("status")) > Int32(5) TableScan: 
test projection=[id, user] TableScan: right projection=[id, user] @@ -1487,8 +1487,8 @@ mod tests { assert_stages!(plan, @r#" ## Original Plan - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("status")) - Filter: mock_leaf(test.user, Utf8("status")) > Int32(5) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("status")) + Filter: leaf_udf(test.user, Utf8("status")) > Int32(5) TableScan: test projection=[id, user] ## After Extraction From 0f26d62ca7da231a2c3a80c6020e98a9feadfabc Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:48:48 -0500 Subject: [PATCH 8/9] revert --- datafusion/optimizer/src/optimizer.rs | 3 --- datafusion/sqllogictest/test_files/explain.slt | 8 -------- datafusion/sqllogictest/test_files/push_down_filter.slt | 9 ++++----- datafusion/sqllogictest/test_files/struct.slt | 4 ++-- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 5 files changed, 7 insertions(+), 19 deletions(-) diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 118ddef49b7e7..877a84fe4dc14 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -43,7 +43,6 @@ use crate::eliminate_join::EliminateJoin; use crate::eliminate_limit::EliminateLimit; use crate::eliminate_outer_join::EliminateOuterJoin; use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; -use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections}; use crate::filter_null_join_keys::FilterNullJoinKeys; use crate::optimize_projections::OptimizeProjections; use crate::optimize_unions::OptimizeUnions; @@ -261,8 +260,6 @@ impl Optimizer { // that might benefit from the following rules Arc::new(EliminateGroupByConstant::new()), Arc::new(CommonSubexprEliminate::new()), - Arc::new(ExtractLeafExpressions::new()), - Arc::new(PushDownLeafProjections::new()), 
Arc::new(OptimizeProjections::new()), ]; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index c5907d497500e..6f615ec391c9e 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -197,8 +197,6 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE -logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -221,8 +219,6 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE -logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true @@ -562,8 +558,6 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE 
-logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -586,8 +580,6 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE -logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE -logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index edafcfaa543f2..b1cb354e053e4 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -116,12 +116,11 @@ explain select * from (select column1, unnest(column2) as o from d) where o['a'] ---- physical_plan 01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o] -02)--FilterExec: __datafusion_extracted_1@0 = 1, projection=[column1@1, __unnest_placeholder(d.column2,depth=1)@2] +02)--FilterExec: get_field(__unnest_placeholder(d.column2,depth=1)@1, a) = 1 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------ProjectionExec: expr=[get_field(__unnest_placeholder(d.column2,depth=1)@1, a) as __datafusion_extracted_1, column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as __unnest_placeholder(d.column2,depth=1)] 
-05)--------UnnestExec -06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] -07)------------DataSourceExec: partitions=1, partition_sizes=[1] +04)------UnnestExec +05)--------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] +06)----------DataSourceExec: partitions=1, partition_sizes=[1] statement ok drop table d; diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 09dd98a50b579..e20815a58c765 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -661,7 +661,7 @@ query TT explain select s['a']['b'] from explain_test; ---- logical_plan -01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) AS explain_test.s[a][b] +01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) 02)--TableScan: explain_test projection=[s] physical_plan 01)ProjectionExec: expr=[get_field(s@0, a, b) as explain_test.s[a][b]] @@ -1666,4 +1666,4 @@ order by id; 3 2 150 statement ok -drop table t_agg_window; +drop table t_agg_window; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 73aeb6c99d0db..1a6b82020c667 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -666,7 +666,7 @@ explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unn logical_plan 01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 02)--Unnest: lists[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1])|depth=2] structs[] -03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), 
recursive_unnest_table.column3 +03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[] 05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3 06)----------TableScan: recursive_unnest_table projection=[column3] From b410680ac1f6e705b942aef73df0ea19356b63f6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 10 Feb 2026 08:49:29 -0500 Subject: [PATCH 9/9] revert --- .../sqllogictest/test_files/projection.slt | 2 +- .../test_files/projection_pushdown.slt | 468 ++++++++---------- 2 files changed, 214 insertions(+), 256 deletions(-) diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index c6885ae40b3e9..5a4411233424a 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -244,7 +244,7 @@ query TT explain select column1.c0 from t; ---- logical_plan -01)Projection: get_field(t.column1, Utf8("c0")) AS t.column1[c0] +01)Projection: get_field(t.column1, Utf8("c0")) 02)--TableScan: t projection=[column1] physical_plan 01)ProjectionExec: expr=[get_field(column1@0, c0) as t.column1[c0]] diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 6dfa66cda51c9..50e26b2fb0b85 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -104,7 +104,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, 
get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet @@ -122,7 +122,7 @@ query TT EXPLAIN SELECT s['label'] FROM simple_struct; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +01)Projection: get_field(simple_struct.s, Utf8("label")) 02)--TableScan: simple_struct projection=[s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -144,7 +144,7 @@ query TT EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -166,7 +166,7 @@ query TT EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; ---- logical_plan -01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), 
Utf8("inner")) 02)--TableScan: nested_struct projection=[id, nested] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet @@ -186,7 +186,7 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet @@ -208,7 +208,7 @@ query TT EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet @@ -235,14 +235,13 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 02)--Filter: simple_struct.id > Int64(2) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS 
__datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] -02)--FilterExec: id@1 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] +02)--FilterExec: id@0 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -260,14 +259,13 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 02)--Filter: simple_struct.id > Int64(2) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] -02)--FilterExec: id@1 > 2 -03)----DataSourceExec: 
file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@0 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -285,14 +283,13 @@ query TT EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; ---- logical_plan -01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[label] -02)--Filter: __datafusion_extracted_1 > Int64(150) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) +02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] physical_plan -01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as simple_struct.s[label]] -02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1, __datafusion_extracted_2@2] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] +02)--FilterExec: get_field(s@1, value) > 150 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query IT @@ -316,7 +313,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -341,7 +338,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -366,7 +363,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -440,7 +437,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; ---- 
logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -463,7 +460,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -486,7 +483,7 @@ EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3 ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -509,7 +506,7 @@ EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) 03)----TableScan: nested_struct projection=[id, nested] 
physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -531,7 +528,7 @@ EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -559,15 +556,14 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----Filter: simple_struct.id > Int64(1) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] -03)----FilterExec: id@1 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as 
simple_struct.s[value]] +03)----FilterExec: id@0 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -587,15 +583,14 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----Filter: simple_struct.id > Int64(1) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] -03)----FilterExec: id@1 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] +03)----FilterExec: id@0 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, 
pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -613,15 +608,14 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LI ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 03)----Filter: simple_struct.id > Int64(1) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] -03)----FilterExec: id@1 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@0 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -679,7 +673,7 @@ EXPLAIN SELECT id, s['value'] FROM 
multi_struct ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] @@ -705,7 +699,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -729,7 +723,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) AS multi_struct.s[value] + Int64(1) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -753,17 +747,16 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, __datafusion_extracted_1 AS multi_struct.s[value] +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) 03)----Filter: multi_struct.id > Int64(2) -04)------Projection: get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_1, multi_struct.id -05)--------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] 
physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] 02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as multi_struct.s[value]] -04)------FilterExec: id@1 > 2 +03)----ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as multi_struct.s[value]] +04)------FilterExec: id@0 > 2 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -781,16 +774,13 @@ query TT EXPLAIN SELECT 
s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS multi_struct.s[label], sum(__datafusion_extracted_2) AS sum(multi_struct.s[value]) -02)--Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[sum(__datafusion_extracted_2)]] -03)----Projection: get_field(multi_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_2 -04)------TableScan: multi_struct projection=[s] +01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] +02)--TableScan: multi_struct projection=[s] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as multi_struct.s[label], sum(__datafusion_extracted_2)@1 as sum(multi_struct.s[value])] -02)--AggregateExec: mode=FinalPartitioned, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] -03)----RepartitionExec: partitioning=Hash([__datafusion_extracted_1@0], 4), input_partitions=3 -04)------AggregateExec: mode=Partial, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] -05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet +01)AggregateExec: mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as 
multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 +03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] +04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet # Verify correctness query TI @@ -819,7 +809,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM nullable_struct; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) AS nullable_struct.s[value] +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) 02)--TableScan: nullable_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet @@ -841,14 +831,13 @@ query TT EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; ---- logical_plan -01)Projection: nullable_struct.id, __datafusion_extracted_2 AS nullable_struct.s[label] -02)--Filter: __datafusion_extracted_1 IS NOT NULL -03)----Projection: get_field(nullable_struct.s, Utf8("value")) AS __datafusion_extracted_1, nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) AS __datafusion_extracted_2 -04)------TableScan: nullable_struct projection=[id, s], 
partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) +02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL +03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] physical_plan -01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as nullable_struct.s[label]] -02)--FilterExec: __datafusion_extracted_1@0 IS NOT NULL, projection=[id@1, __datafusion_extracted_2@2] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] +02)--FilterExec: get_field(s@1, value) IS NOT NULL +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query IT @@ -867,7 +856,7 @@ EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("value")) + Int64(10) AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("value")) + Int64(10), get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -965,29 +954,27 @@ EXPLAIN 
SELECT (id + s['value']) * (id + s['value']) as id_and_value FROM simple ---- logical_plan 01)Projection: __common_expr_1 * __common_expr_1 AS id_and_value -02)--Projection: simple_struct.id + __datafusion_extracted_2 AS __common_expr_1 +02)--Projection: simple_struct.id + get_field(simple_struct.s, Utf8("value")) AS __common_expr_1 03)----Filter: simple_struct.id > Int64(2) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan 01)ProjectionExec: expr=[__common_expr_1@0 * __common_expr_1@0 as id_and_value] -02)--ProjectionExec: expr=[id@1 + __datafusion_extracted_2@0 as __common_expr_1] -03)----FilterExec: id@1 > 2 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +02)--ProjectionExec: expr=[id@0 + get_field(s@1, value) as __common_expr_1] +03)----FilterExec: id@0 > 2 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] query TT EXPLAIN SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: __datafusion_extracted_1 + __datafusion_extracted_1 AS doubled +01)Projection: get_field(simple_struct.s, Utf8("value")) + get_field(simple_struct.s, Utf8("value")) AS doubled 02)--Filter: simple_struct.id > Int64(2) 
-03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 + __datafusion_extracted_1@0 as doubled] -02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) + get_field(s@0, value) as doubled] +02)--FilterExec: id@0 > 2, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1005,14 +992,13 @@ query TT EXPLAIN SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label] +01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) 02)--Filter: simple_struct.id > Int64(2) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----TableScan: simple_struct projection=[id, s], 
partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label]] -02)--FilterExec: id@2 > 2, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value], get_field(s@0, label) as simple_struct.s[label]] +02)--FilterExec: id@0 > 2, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query IT @@ -1055,14 +1041,13 @@ query TT EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score +01)Projection: get_field(simple_struct.s, Utf8("value")) * Int64(2) + CAST(character_length(get_field(simple_struct.s, Utf8("label"))) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score 02)--Filter: simple_struct.id > Int64(1) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], 
partial_filters=[simple_struct.id > Int64(1)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 * 2 + CAST(character_length(__datafusion_extracted_2@1) AS Int64) as score] -02)--FilterExec: id@2 > 1, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) * 2 + CAST(character_length(get_field(s@0, label)) AS Int64) as score] +02)--FilterExec: id@0 > 1, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query I @@ -1088,7 +1073,7 @@ EXPLAIN SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1111,7 +1096,7 @@ EXPLAIN SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: 
simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100) AS simple_struct.s[value] + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") AS simple_struct.s[label] || Utf8("_test") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1132,14 +1117,13 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 02)--Filter: simple_struct.id > Int64(1) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] -02)--FilterExec: id@1 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 
1, required_guarantees=[] # Verify correctness query II @@ -1152,14 +1136,13 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5); ---- logical_plan -01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] +01)Projection: get_field(simple_struct.s, Utf8("value")) 02)--Filter: simple_struct.id > Int64(1) AND (simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] -02)--FilterExec: id@1 > 1 AND (id@1 < 4 OR id@1 = 5), projection=[__datafusion_extracted_1@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] +02)--FilterExec: id@0 > 1 AND (id@0 < 4 OR id@0 = 5), projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != 
row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] # Verify correctness - should return rows where (id > 1) AND ((id < 4) OR (id = 5)) # That's: id=2,3 (1 1 AND id < 5; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] +01)Projection: get_field(simple_struct.s, Utf8("value")) 02)--Filter: simple_struct.id > Int64(1) AND simple_struct.id < Int64(5) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] -02)--FilterExec: id@1 > 1 AND id@1 < 5, projection=[__datafusion_extracted_1@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] +02)--FilterExec: id@0 > 1 AND id@0 < 5, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] # Verify correctness - should return rows where 1 < id < 5 (id=2,3,4) query I @@ -1195,14 +1177,13 @@ 
query TT EXPLAIN SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label], simple_struct.id +01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")), simple_struct.id 02)--Filter: simple_struct.id > Int64(1) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label], id@2 as id] -02)--FilterExec: id@2 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label], id@0 as id] +02)--FilterExec: id@0 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness - note that id is now at index 2 in the augmented projection query ITI @@ -1216,14 +1197,13 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE 
length(s['label']) > 4; ---- logical_plan -01)Projection: __datafusion_extracted_2 AS simple_struct.s[value] -02)--Filter: character_length(__datafusion_extracted_1) > Int32(4) -03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2 -04)------TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] +01)Projection: get_field(simple_struct.s, Utf8("value")) +02)--Filter: character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4) +03)----TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_2@0 as simple_struct.s[value]] -02)--FilterExec: character_length(__datafusion_extracted_1@0) > 4, projection=[__datafusion_extracted_2@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet +01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] +02)--FilterExec: character_length(get_field(s@0, label)) > 4 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet # Verify correctness - filter on rows where label length > 4 (all have length 5, except 'one' has 3) # Wait, from the data: alpha(5), beta(4), gamma(5), delta(5), epsilon(7) @@ -1250,13 +1230,12 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: __datafusion_extracted_1 ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS 
__datafusion_extracted_1 -04)------TableScan: simple_struct projection=[id, s] +02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST +03)----TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet +02)--SortExec: expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query I @@ -1279,13 +1258,13 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label']; ---- logical_plan 01)Projection: simple_struct.id, simple_struct.s[value] -02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 +02)--Sort: simple_struct.id ASC NULLS LAST, get_field(simple_struct.s, Utf8("label")) ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), simple_struct.s 04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]] -02)--SortExec: expr=[id@0 ASC NULLS LAST, __datafusion_extracted_1@2 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], 
get_field(s@1, label) as __datafusion_extracted_1], file_type=parquet +02)--SortExec: expr=[id@0 ASC NULLS LAST, get_field(s@2, label) ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], s], file_type=parquet # Verify correctness query II @@ -1308,13 +1287,12 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: __datafusion_extracted_1 ASC NULLS LAST, fetch=2 -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 -04)------TableScan: simple_struct projection=[id, s] +02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST, fetch=2 +03)----TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: TopK(fetch=2), expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet +02)--SortExec: TopK(fetch=2), expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query I @@ -1334,13 +1312,12 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] * 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: __datafusion_extracted_1 * Int64(2) ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 -04)------TableScan: simple_struct 
projection=[id, s] +02)--Sort: get_field(simple_struct.s, Utf8("value")) * Int64(2) ASC NULLS LAST +03)----TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[__datafusion_extracted_1@1 * 2 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet +02)--SortExec: expr=[get_field(s@1, value) * 2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query I @@ -1362,7 +1339,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value']; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST, simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -1413,15 +1390,13 @@ INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10; ---- logical_plan 01)Projection: simple_struct.id, join_right.id -02)--Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 * Int64(10) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s] -05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id -06)------TableScan: join_right projection=[id, s] +02)--Inner Join: 
get_field(simple_struct.s, Utf8("value")) = get_field(join_right.s, Utf8("level")) * Int64(10) +03)----TableScan: simple_struct projection=[id, s] +04)----TableScan: join_right projection=[id, s] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(simple_struct.s[value]@2, join_right.s[level] * Int64(10)@2)], projection=[id@0, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s, get_field(s@1, level) * 10 as join_right.s[level] * Int64(10)], file_type=parquet # Verify correctness - value = level * 10 # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) @@ -1449,14 +1424,13 @@ WHERE simple_struct.s['value'] > 150; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: __datafusion_extracted_1 > Int64(150) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], 
partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] -06)--TableScan: join_right projection=[id] +03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +05)--TableScan: join_right projection=[id] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +02)--FilterExec: get_field(s@1, value) > 150, projection=[id@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet 04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches and value > 150 @@ -1485,19 +1459,17 @@ WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: __datafusion_extracted_1 > Int64(100) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] -06)--Projection: join_right.id -07)----Filter: __datafusion_extracted_2 > Int64(3) -08)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id -09)--------TableScan: join_right projection=[id, s], 
partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] +03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(100) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] +05)--Projection: join_right.id +06)----Filter: get_field(join_right.s, Utf8("level")) > Int64(3) +07)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet -04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet +02)--FilterExec: get_field(s@1, value) > 100, projection=[id@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +04)--FilterExec: get_field(s@1, level) > 3, projection=[id@0] +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches, value > 100, and level > 3 # Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250) @@ -1523,17 +1495,15 @@ FROM simple_struct INNER JOIN join_right ON simple_struct.id = join_right.id; ---- logical_plan -01)Projection: simple_struct.id, 
__datafusion_extracted_1 AS simple_struct.s[label], __datafusion_extracted_2 AS join_right.s[role] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")), get_field(join_right.s, Utf8("role")) 02)--Inner Join: simple_struct.id = join_right.id -03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s] -05)----Projection: get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_2, join_right.id -06)------TableScan: join_right projection=[id, s] +03)----TableScan: simple_struct projection=[id, s] +04)----TableScan: join_right projection=[id, s] physical_plan -01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]] -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[__datafusion_extracted_1@0, id@1, __datafusion_extracted_2@2] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] +01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label], get_field(s@2, role) as join_right.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@0, s@1, s@3] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +04)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness query ITT @@ -1591,20 +1561,17 @@ FROM simple_struct LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5; ---- logical_plan -01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[value], __datafusion_extracted_3 AS join_right.s[level] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(join_right.s, Utf8("level")) 02)--Left Join: simple_struct.id = join_right.id -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id -04)------TableScan: simple_struct projection=[id, s] -05)----Projection: join_right.id, __datafusion_extracted_3 -06)------Filter: __datafusion_extracted_1 > Int64(5) -07)--------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_1, join_right.id, get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_3 -08)----------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] +03)----TableScan: simple_struct projection=[id, s] +04)----Filter: get_field(join_right.s, Utf8("level")) > Int64(5) +05)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] physical_plan -01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_2@0 as simple_struct.s[value], __datafusion_extracted_3@2 as join_right.s[level]] -02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[__datafusion_extracted_2@0, id@1, __datafusion_extracted_3@3] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], 
file_type=parquet -04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2] -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet +01)ProjectionExec: expr=[id@1 as id, get_field(s@2, value) as simple_struct.s[value], get_field(s@0, level) as join_right.s[level]] +02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(id@0, id@0)], projection=[s@1, id@2, s@3] +03)----FilterExec: get_field(s@1, level) > 5 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness - left join with level > 5 condition # Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8) @@ -1632,15 +1599,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] +01)Projection: get_field(simple_struct.s, Utf8("value")) 02)--Filter: simple_struct.id > Int64(2) -03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] -02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +01)ProjectionExec: expr=[get_field(s@0, value) as 
simple_struct.s[value]] +02)--FilterExec: id@0 > 2, projection=[s@1] 03)----RepartitionExec: partitioning=RoundRobinBatch(32), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] ##################### # Section 14: SubqueryAlias tests @@ -1655,16 +1621,15 @@ query TT EXPLAIN SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS t.s[value] +01)Projection: get_field(t.s, Utf8("value")) 02)--SubqueryAlias: t -03)----Projection: __datafusion_extracted_1 +03)----Projection: simple_struct.s 04)------Filter: simple_struct.id > Int64(2) -05)--------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] -02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND 
id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] +02)--FilterExec: id@0 > 2, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1680,10 +1645,9 @@ EXPLAIN SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t O ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] +02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) 03)----SubqueryAlias: t -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 -05)--------TableScan: simple_struct projection=[s] +04)------TableScan: simple_struct projection=[s] physical_plan 01)SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as t.s[value], get_field(s@1, label) as t.s[label]], file_type=parquet @@ -1703,17 +1667,16 @@ query TT EXPLAIN SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS u.s[value] +01)Projection: get_field(u.s, Utf8("value")) 02)--SubqueryAlias: u 03)----SubqueryAlias: t -04)------Projection: __datafusion_extracted_1 +04)------Projection: simple_struct.s 05)--------Filter: simple_struct.id > Int64(2) -06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -07)------------TableScan: 
simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as u.s[value]] -02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[get_field(s@0, value) as u.s[value]] +02)--FilterExec: id@0 > 2, projection=[s@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1730,12 +1693,11 @@ EXPLAIN SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 20 logical_plan 01)SubqueryAlias: t 02)--Projection: simple_struct.id -03)----Filter: __datafusion_extracted_1 > Int64(200) -04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] +03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(200) +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] physical_plan -01)FilterExec: __datafusion_extracted_1@0 > 200, projection=[id@1] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, 
projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +01)FilterExec: get_field(s@1, value) > 200, projection=[id@0] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet # Verify correctness query I @@ -1757,24 +1719,22 @@ EXPLAIN SELECT s['value'] FROM ( ) t; ---- logical_plan -01)Projection: __datafusion_extracted_1 AS t.s[value] +01)Projection: get_field(t.s, Utf8("value")) 02)--SubqueryAlias: t 03)----Union -04)------Projection: __datafusion_extracted_1 +04)------Projection: simple_struct.s 05)--------Filter: simple_struct.id <= Int64(3) -06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -08)------Projection: __datafusion_extracted_1 -09)--------Filter: simple_struct.id > Int64(3) -10)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id -11)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +07)------Projection: simple_struct.s +08)--------Filter: simple_struct.id > Int64(3) +09)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan -01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] +01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] 02)--UnionExec -03)----FilterExec: id@1 <= 3, projection=[__datafusion_extracted_1@0] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], 
file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -05)----FilterExec: id@1 > 3, projection=[__datafusion_extracted_1@0] -06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +03)----FilterExec: id@0 <= 3, projection=[s@1] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +05)----FilterExec: id@0 > 3, projection=[s@1] +06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query I @@ -1800,26 +1760,24 @@ EXPLAIN SELECT s['value'], s['label'] FROM ( ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] +02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) 03)----SubqueryAlias: t 04)------Union -05)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 +05)--------Projection: simple_struct.s 06)----------Filter: simple_struct.id <= Int64(3) -07)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id -08)--------------TableScan: simple_struct 
projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -09)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 -10)----------Filter: simple_struct.id > Int64(3) -11)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id -12)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +08)--------Projection: simple_struct.s +09)----------Filter: simple_struct.id > Int64(3) +10)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan 01)SortPreservingMergeExec: [t.s[value]@0 ASC NULLS LAST] 02)--SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value], __datafusion_extracted_2@1 as t.s[label]] +03)----ProjectionExec: expr=[get_field(s@0, value) as t.s[value], get_field(s@0, label) as t.s[label]] 04)------UnionExec -05)--------FilterExec: id@2 <= 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -07)--------FilterExec: id@2 > 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] -08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +05)--------FilterExec: id@0 <= 3, projection=[s@1] +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +07)--------FilterExec: id@0 > 3, projection=[s@1] +08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query IT