diff --git a/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java b/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java index 8071468c0..76b20c8e1 100644 --- a/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java +++ b/isthmus/src/main/java/io/substrait/isthmus/OuterReferenceResolver.java @@ -66,6 +66,27 @@ public RelNode visit(Filter filter) throws RuntimeException { for (CorrelationId id : filter.getVariablesSet()) { nestedDepth.putIfAbsent(id, 0); } + // Also register any CorrelationIds referenced directly in the condition expression. + // This covers partially-decorrelated plans where a Calcite optimizer (e.g. HepPlanner) + // rewrites correlated IN-subqueries into joins but leaves scalar-aggregate subquery + // correlations as RexFieldAccess($corN.col) inside a Filter condition without a + // surrounding Correlate node. In that case getVariablesSet() is empty but the condition + // still contains RexCorrelVariable references whose IDs must be in nestedDepth before + // rexVisitor.visitFieldAccess() is called. + filter + .getCondition() + .accept( + new RexShuttle() { + @Override + public RexNode visitFieldAccess(RexFieldAccess fieldAccess) { + if (fieldAccess.getReferenceExpr() instanceof RexCorrelVariable) { + CorrelationId id = + ((RexCorrelVariable) fieldAccess.getReferenceExpr()).id; + nestedDepth.putIfAbsent(id, 0); + } + return fieldAccess; + } + }); filter.getCondition().accept(rexVisitor); return super.visit(filter); } diff --git a/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java b/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java index a804aafb1..aa151b007 100644 --- a/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java +++ b/isthmus/src/test/java/io/substrait/isthmus/OuterReferenceResolverTest.java @@ -7,6 +7,7 @@ import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rex.RexCorrelVariable; import org.apache.calcite.rex.RexFieldAccess; +import org.apache.calcite.rex.RexNode; import org.apache.calcite.sql.parser.SqlParseException; import org.apache.calcite.tools.RelBuilder; import org.apache.calcite.util.Holder; @@ -142,4 +143,63 @@ void nestedApplyJoinQuery() throws SqlParseException { validateOuterRef(fieldAccessDepthMap, "$cor0", "SS_ITEM_SK", 2); validateOuterRef(fieldAccessDepthMap, "$cor1", "I_ITEM_SK", 1); } + + /** + * Regression test for the partially-decorrelated Filter case. + * + *

When a Calcite optimizer (e.g. HepPlanner) rewrites a correlated IN-subquery into a join, it + * can produce a {@link org.apache.calcite.rel.core.Filter} whose condition contains a {@link + * RexFieldAccess} referencing a {@link RexCorrelVariable} ({@code $cor0.field}), but whose {@link + * RelNode#getVariablesSet()} is empty — because the enclosing {@link + * org.apache.calcite.rel.core.Correlate} node has already been replaced by a regular join. + * + *

Before the fix, {@code OuterReferenceResolver.visit(Filter)} only iterated {@code + * filter.getVariablesSet()}, so the {@link org.apache.calcite.rel.core.CorrelationId} was never + * registered in {@code nestedDepth}. The subsequent {@code rexVisitor.visitFieldAccess()} call + * found {@code !nestedDepth.containsKey(id)} and silently skipped the access, producing an empty + * {@code fieldAccessDepthMap} instead of the expected entry. + * + *

The fix pre-scans the Filter condition for {@link RexCorrelVariable} references and registers + * their IDs before delegating to the rex visitor. + */ + @Test + void filterWithCorrelVariableButEmptyVariablesSet() throws SqlParseException { + // Build the partially-decorrelated pattern: + // JOIN (inner side has a Filter whose condition references $cor0 but variablesSet is empty) + // + // SQL equivalent of what a post-optimisation plan looks like: + // SELECT ss_sold_date_sk FROM store_sales + // JOIN item ON item.i_item_sk = store_sales.ss_item_sk ← decorrelated join + // WHERE item.i_item_sk = $cor0.SS_ITEM_SK ← Filter still has the correl ref + // + // We deliberately call .filter(condition) without passing the CorrelationId so that + // getVariablesSet() returns an empty set, reproducing the post-decorrelation shape. + + final Holder cor0 = Holder.empty(); + + // Push STORE_SALES and capture the correlation variable for it. + tpcDsRelBuilder.scan("tpcds", "STORE_SALES").variable(cor0::set); + + // Push ITEM on top, then build the filter condition while ITEM is the top-of-stack. + // The condition equates item.I_ITEM_SK to the correlated $cor0.SS_ITEM_SK. + tpcDsRelBuilder.scan("tpcds", "ITEM"); + RexNode correlCondition = + tpcDsRelBuilder.equals( + tpcDsRelBuilder.field("I_ITEM_SK"), + tpcDsRelBuilder.field(cor0.get(), "SS_ITEM_SK")); + + final RelNode calciteRel = + tpcDsRelBuilder + // Intentionally NOT passing cor0.get().id to filter() → getVariablesSet() is empty. + .filter(correlCondition) + .join(JoinRelType.INNER, tpcDsRelBuilder.literal(true)) + .project(tpcDsRelBuilder.field("SS_SOLD_DATE_SK")) + .build(); + + // Pre-fix: fieldAccessDepthMap would be empty because the CorrelationId was never registered. + // Post-fix: the Filter condition pre-scan registers $cor0, so the field access is recorded. + final Map fieldAccessDepthMap = buildOuterFieldRefMap(calciteRel); + Assertions.assertEquals(1, fieldAccessDepthMap.size()); + validateOuterRef(fieldAccessDepthMap, "$cor0", "SS_ITEM_SK", 0); + } }