From 7aeb8222bb6912ae95d37e82787d7bdc861b2b0e Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Sun, 21 Jun 2026 12:48:01 -0400 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20contrib=20Delta=20serde=20+=20nativ?= =?UTF-8?q?e=20exec=20=E2=80=94=20end-to-end=20native=20reads=20[Delta=20c?= =?UTF-8?q?ontrib=20split,=20part=204b]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part 4b of the Delta Lake contrib PR breakup (tracking: #4366). The red-to-green moment: a `-Pcontrib-delta` build now does END-TO-END native Delta reads. `CometExecRule`'s scanHandler lookup (wired in part 2) now resolves -- the serde converts the `CometDeltaScanMarker` (planted by part 4a's DeltaScanRule) into a `CometDeltaNativeScanExec` that reads through delta-kernel-rs (parts 3a/3b). - `CometDeltaNativeScan.scala` — the serde: marker -> native scan operator (schema annotation, column mapping, row tracking, partition handling). CDF conversion is deferred to part 5 (the `convertCdf` path is carved out here to avoid a compile dependency on `CometDeltaCdfScanExec`). `ScanImpl` is not redefined — part 4a moved it to `DeltaScanMetadata`. - `CometDeltaNativeScanExec.scala` — the exec (`CometScanWithPlanData`): synthesises file partitions from kernel scan tasks, applies DPP pruning. Interim error semantics (until part 8): the `perPartitionFilePaths` / `FAILED_READ_FILE` plumbing is omitted, so a Delta read failure surfaces as a generic `CometNativeException` (the `CometExecRDD` param defaults to empty). - `Native.scala` — JNI declarations binding the part-3a Rust entry points. - `DeltaPlanDataInjector.scala` — registers under `OpStruct::DELTA_SCAN`; part 1's reflective registry picks it up, so per-partition Delta data is injected at execution. No core / earlier-unit edits — the reflective wiring already reaches the serde + injector the moment these classes land. Tests (gated, end-to-end native reads): CometDeltaNativeSuite (19), CometDeltaColumnMappingSuite (5), CometDeltaFeaturesSuite (8), CometDeltaCoverageSuite (24), CometDeltaColumnMappingPhysicalNameReproSuite (1) — all pass. CometDeltaTestBase re-gains the native-read helpers (kept part 4a's marker helpers that are still used). CometDeltaMarkerSuite updated: with the serde present, a claimed scan now engages `CometDeltaNativeScanExec` (it no longer leaves the marker in the plan), so its assertions moved from marker-presence to native engagement. Verification: gated JVM test-compile, 60 contrib tests across 6 suites, spotless/scalastyle, check-suites, gate-verify (default build still 0 Delta symbols) — all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../contrib/delta/CometDeltaNativeScan.scala | 1522 +++++++++++++++++ .../apache/comet/contrib/delta/Native.scala | 82 + .../sql/comet/CometDeltaNativeScanExec.scala | 554 ++++++ .../sql/comet/DeltaPlanDataInjector.scala | 91 + ...aColumnMappingPhysicalNameReproSuite.scala | 53 + .../delta/CometDeltaColumnMappingSuite.scala | 211 +++ .../delta/CometDeltaCoverageSuite.scala | 516 ++++++ .../delta/CometDeltaFeaturesSuite.scala | 269 +++ .../contrib/delta/CometDeltaMarkerSuite.scala | 53 +- .../contrib/delta/CometDeltaNativeSuite.scala | 490 ++++++ .../contrib/delta/CometDeltaTestBase.scala | 97 +- 11 files changed, 3879 insertions(+), 59 deletions(-) create mode 100644 contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala create mode 100644 contrib/delta/src/main/scala/org/apache/comet/contrib/delta/Native.scala create mode 100644 contrib/delta/src/main/scala/org/apache/spark/sql/comet/CometDeltaNativeScanExec.scala create mode 100644 contrib/delta/src/main/scala/org/apache/spark/sql/comet/DeltaPlanDataInjector.scala create mode 100644 contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingPhysicalNameReproSuite.scala create mode 100644 contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingSuite.scala create mode 100644 contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaCoverageSuite.scala create mode 100644 contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaFeaturesSuite.scala create mode 100644 contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaNativeSuite.scala diff --git a/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala new file mode 100644 index 0000000000..db0fde87e9 --- /dev/null +++ b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala @@ -0,0 +1,1522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import java.util.Locale + +import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{And, BoundReference, Expression, InterpretedPredicate} +import org.apache.spark.sql.comet.{CometDeltaNativeScanExec, CometDeltaScanMarker, CometNativeExec, SerializedPlan} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +import org.apache.comet.{CometConf, ConfigEntry} +// Contrib-private Java proto types generated by protoc-jar-maven-plugin from +// contrib/delta/native/src/proto/delta_operator.proto. The proto declares +// `option java_package = "org.apache.comet.contrib.delta.proto"` so the generated +// outer class lands under a Comet-prefixed Java package. +// Typed Delta proto messages now live in core's operator.proto (alongside IcebergScan) +// instead of a contrib-private proto package. +import org.apache.comet.serde.OperatorOuterClass.{DeltaScan, DeltaScanCommon, DeltaScanTaskList} +import org.apache.comet.objectstore.NativeConfig +import org.apache.comet.serde.{CometOperatorSerde, Compatible, ExprOuterClass, OperatorOuterClass, SupportLevel} +import org.apache.comet.serde.ExprOuterClass.Expr +import org.apache.comet.serde.OperatorOuterClass.Operator +import org.apache.comet.serde.QueryPlanSerde.exprToProto +import org.apache.comet.serde.operator.schema2Proto + +/** + * Validation and serde logic for the native Delta Lake scan. + * + * `convert()` calls `Native.planDeltaScan` to enumerate files via `delta-kernel-rs`, builds the + * `DeltaScanCommon` proto with schemas/filters/options, applies static partition pruning, and + * stashes the task list in a ThreadLocal. `createExec()` retrieves it and builds a + * `CometDeltaNativeScanExec` with split-mode serialization: common data serialized once at + * planning time, per-partition task lists materialized lazily at execution time. DPP filters are + * applied at execution time in the exec's `serializedPartitionData`. + */ +/** + * Delta-scan serde + exec factory. Extends Comet's core `CometOperatorSerde` trait so + * the existing convertToComet path in `CometExecRule` invokes it just like the + * built-in handlers (CometNativeScan, CometIcebergNativeScan, ...). What is NOT here + * is any *extension/discovery* SPI -- core's `CometExecRule` resolves this object via + * `DeltaIntegration.scanHandler` (one reflective class lookup, no ServiceLoader, no + * registry). The wire format is the typed `OpStruct::DeltaScan` variant. + */ +object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] with Logging { + + // Single source of truth for the Spark `_metadata.*` file-level column names the native + // Delta scan synthesises. These were repeated verbatim across several scan-planning + // methods below; defined once here so the emit-name lists can't drift (a dropped name => + // N-1 columns where Spark expected N -- the class of bug behind several CDC/row-tracking + // failures). Mirror the native `META_*` consts in contrib/delta/native/src/synthetic_columns.rs. + private[delta] val SparkFileMetadataNames: Set[String] = Set( + "file_path", + "file_name", + "file_size", + "file_block_start", + "file_block_length", + "file_modification_time") + + // Per-file row-tracking metadata columns (present only on row-tracking-enabled tables). + // `default_row_commit_version` must accompany `base_row_id`, else row-tracking/CDC reads + // see N-1 columns where Spark expected N. + private[delta] val PerFileRowTrackingNames: Set[String] = + Set("base_row_id", "default_row_commit_version") + + // All per-file metadata columns: Spark file metadata + row-tracking. + private[delta] val PerFileMetadataNames: Set[String] = + SparkFileMetadataNames ++ PerFileRowTrackingNames + + /** + * `kind` string for the `ContribOp` envelope this serde produces. The native side's + * `comet-contrib-delta` rlib registers `DeltaScanPlanner` under this same kind via + * `register_contrib_planner(DELTA_SCAN_KIND, ...)` in `contrib/delta/native/src/lib.rs`. Keep + * the two in sync. + */ + val DeltaScanKind: String = "delta-scan" + + /** Private lazy handle to the native library - one instance per JVM. */ + private lazy val nativeLib = new org.apache.comet.contrib.delta.Native() + + // Phase 5: stash the raw task-list bytes between convert() and createExec() + // so the exec can do per-partition splitting at execution time. Single-threaded + // during planning so a simple ThreadLocal is safe. + private val lastTaskListBytes = new ThreadLocal[Array[Byte]]() + + // When a scan projects a per-file `_metadata.file_path` column, `DeltaScanRule` sets + // `oneTaskPerPartition = true` in the marker's `DeltaScanMetadata`. We read it here to (a) skip + // byte-range splitting in splitTasks and (b) emit `oneTaskPerPartition = true` on the + // CometDeltaNativeScanExec so packTasks keeps each task in its own partition -- the native + // plan emits one parquet file-group per file, so multiple files in one Spark partition would + // drop the 2nd+ files' rows. + /** + * True for Delta's MATERIALISED row-tracking column names + * (`_row-id-col-` / `_row-commit-version-col-`). These are real parquet + * columns persisted when a file is rewritten to keep row IDs stable, read from the + * file by name -- NOT synthesised. (Distinct from the logical `row_id` / + * `row_commit_version` synthetic columns, which ARE synthesised from baseRowId + + * row_index when no materialised column exists.) + */ + private[delta] def isMaterializedRowTrackingName(name: String): Boolean = { + val lc = name.toLowerCase(Locale.ROOT) + lc.startsWith("_row-id-col-") || lc.startsWith("_row-commit-version-col-") + } + + // Kernel's marker for a metadata column: field metadata `delta.metadataSpec` whose value is the + // spec's text (e.g. "row_id"). Matches `ColumnMetadataKey::MetadataSpec` + `MetadataColumnSpec` + // in delta-kernel-rs; it rides through Spark's `StructType.json` into kernel's `StructType` serde. + private val KernelMetadataSpecKey = "delta.metadataSpec" + private val KernelRowIdSpec = "row_id" + + /** + * If `f` is a materialised row-ID column (`_row-id-col-*`), return it re-marked as kernel's RowId + * metadata column so kernel resolves it from `delta.rowTracking.materializedRowIdColumnName` by + * name (no column-mapping id/physicalName needed) and generates the value. Otherwise return `f` + * unchanged. Only invoked under active column mapping (see caller). + */ + private def asKernelRowIdMetadataColumnIfMaterialized(f: StructField): StructField = { + if (f.name.toLowerCase(Locale.ROOT).startsWith("_row-id-col-")) { + val md = new org.apache.spark.sql.types.MetadataBuilder() + .withMetadata(f.metadata) + .putString(KernelMetadataSpecKey, KernelRowIdSpec) + .build() + StructField(f.name, org.apache.spark.sql.types.LongType, nullable = true, md) + } else { + f + } + } + + /** + * Translate Delta's `delta.columnMapping.id` metadata key to Spark+parquet's standard + * `parquet.field.id` key on every StructField at every level of nesting. Required for + * column-mapping `id` mode: Delta writes parquet files with `PARQUET:field_id` metadata + * (i.e. the same field IDs it stores in its own metadata), but Spark's + * `ParquetUtils.hasFieldId` -- and therefore Comet's serialisers -- only look at + * `parquet.field.id`. Without this translation, `use_field_id=true` would still find + * no IDs on the Spark schema and silently degrade to name-based matching. + * + * Top-level field metadata gets the new entry merged in via `MetadataBuilder`; nested + * StructTypes recurse; ArrayType and MapType walk into their element/key/value types. + * Fields without `delta.columnMapping.id` are passed through unchanged (e.g. partition + * columns, synthetic row-index columns, struct-leaf fields the metadata strip elided). + */ + /** + * Names that appear in `scan.requiredSchema` but are NOT real parquet columns: Delta/Spark + * synthetic + `_metadata.*` virtual columns synthesised natively after the scan. They must be + * excluded from the kernel read projection (kernel would look for non-existent file columns). + * Mirrors the `syntheticNames` set used later when stripping `required_schema` for the proto. + * Materialised row-tracking columns (`_row-id-col-*` / `_row-commit-version-col-*`) are real + * parquet columns and are deliberately NOT here. + */ + private[delta] val SyntheticReadFieldNames: Set[String] = Set( + DeltaReflection.RowIndexColumnName, + DeltaReflection.TmpMetadataRowIndexColumnName, + DeltaReflection.IsRowDeletedColumnName, + DeltaReflection.RowIdColumnName, + DeltaReflection.RowCommitVersionColumnName, + "file_path", + "file_name", + "file_size", + "file_block_start", + "file_block_length", + "file_modification_time", + "base_row_id", + "default_row_commit_version").map(_.toLowerCase(Locale.ROOT)) + + /** + * The query's data-read columns -- `scan.requiredSchema` minus synthetic/metadata columns + * (partition columns are never in `requiredSchema`: Spark gives the data half) -- serialized as + * an Arrow IPC schema message for the driver's `scan.with_schema(...)`. Pure-logical names at + * every nesting level, so kernel resolves the projected physical names + field-ids itself and + * returns `scan.physical_schema()` / `scan.logical_schema()` for the executor. Empty array when + * there are no data columns to read (partition-/synthetic-only scan) -- the driver then skips the + * projection and the executor drives the row count without a parquet read. + */ + /** + * The kernel-read data-read schema as Delta schema JSON (`StructType.json`) -- the single carrier + * the driver feeds to kernel's `ScanBuilder::with_schema`. It is the query's data columns + * (`requiredSchema` minus synthetic/metadata columns), each drawn from `annotatedSource` when + * present so it carries `delta.columnMapping.physicalName` + `id` at every nesting level (the same + * Delta-JSON format kernel reads from the log). `annotatedSource` should be the ANALYSIS-TIME + * schema, falling back to the live snapshot schema -- so kernel resolves the physical names the + * query was PLANNED with and null-fills columns whose field-id changed since analysis (Delta's + * schema-on-read escape hatch). For a column the source doesn't cover (non-column-mapping tables, + * or no source at all) the `requiredSchema` field is used as-is (no annotations needed). Returns + * `""` for a read with zero data columns (partition-/synthetic-only) -- no `with_schema` then. + */ + private[delta] def dataReadSchemaJson( + annotatedSource: Option[StructType], + requiredSchema: StructType, + partitionSchema: StructType = new StructType(), + rowTrackingActive: Boolean = false): String = { + // `row_id` / `row_commit_version` are SYNTHETIC (kernel doesn't read them; we synthesise from + // baseRowId + row_index) ONLY when row tracking is enabled. With it disabled they are ordinary + // user data columns -- a table may legitimately have a column named `row_id`. The proto + // `required_schema` keeps them in that case (the emit flags are gated on row tracking too), so + // the read schema MUST keep them as well, else the executor sees `required_schema` data columns + // with no kernel schema shipped ("missing kernel data-column schemas"). Mirror the emit gating. + val stripNames = + if (rowTrackingActive) SyntheticReadFieldNames + else + SyntheticReadFieldNames - + DeltaReflection.RowIdColumnName.toLowerCase(Locale.ROOT) - + DeltaReflection.RowCommitVersionColumnName.toLowerCase(Locale.ROOT) + val dataFields = requiredSchema.fields.filterNot(f => + stripNames.contains(f.name.toLowerCase(Locale.ROOT))) + if (dataFields.isEmpty) { + // Zero data columns (partition-only / synthetic-only reads): no kernel read schema; the + // executor drives the row count without a parquet read and the partition columns are filled + // separately. (Kernel can't drive a zero-column scan, so we don't project partitions here.) + "" + } else { + val byName = + annotatedSource.map(_.fields.map(f => f.name.toLowerCase(Locale.ROOT) -> f).toMap) + .getOrElse(Map.empty) + val pick = (f: StructField) => byName.getOrElse(f.name.toLowerCase(Locale.ROOT), f) + // Partition columns aren't in `requiredSchema` (Spark hands us the data half), so append them + // when a `partitionSchema` is supplied -- then kernel's per-file transform INJECTS them (the + // max-kernel path) instead of Comet appending them. Sourced from the annotated schema by name + // so column-mapping physical names / field-ids ride along. The AddFiles route passes an empty + // `partitionSchema` (its identity transform can't inject partitions, so partitions stay + // Comet-appended there until that route also moves to kernel enumeration). + val projected0 = dataFields.map(pick) ++ partitionSchema.fields.map(pick) + // Materialised row-id columns (`_row-id-col-*`, added by OPTIMIZE/UPDATE/MERGE) are matched by + // NAME and carry NO column-mapping annotation. Under ACTIVE column mapping kernel's logical + // with_schema requires both physicalName AND id on every regular field, so shipping the + // materialised column as a plain data field fails ("lacks delta.columnMapping.physicalName/id"). + // The kernel-intended way is to request the RowId METADATA column: mark the field with kernel's + // `delta.metadataSpec` = `row_id` (ColumnMetadataKey::MetadataSpec). Kernel then reads + // `delta.rowTracking.materializedRowIdColumnName` by name (bypassing CM make_physical), adds a + // row_index helper, and emits `GenerateRowId` (coalesce(materialised, baseRowId+row_index)) on + // the per-file transform -- so row_id comes from kernel, correct even under CM-id. Only needed + // under active CM (detected from a real data field carrying a physicalName); plain tables read + // the materialised column fine as a data field, so leave them untouched. RowCommitVersion has + // no kernel metadata-column support (Error::unsupported), so `_row-commit-version-col-*` is left + // as-is. See state_info.rs RowId handling + CometDeltaRowTrackingMaterializedSuite (M3). + val columnMappingActive = + projected0.exists(_.metadata.contains(DeltaReflection.PhysicalNameMetadataKey)) + val projected = + if (columnMappingActive) projected0.map(asKernelRowIdMetadataColumnIfMaterialized) + else projected0 + StructType(projected).json + } + } + + /** + * Kernel read schema for the in-worker synthesis path (`synthesize_in_worker`): data + partitions, + * plus `row_index` as a kernel `RowIndex` metadata column and `row_id` as a kernel `RowId` metadata + * column (kernel injects/generates them). The WORKER-only synthetics -- `is_row_deleted`, + * `row_commit_version`, and Spark `_metadata.*` per-file constants -- are EXCLUDED (kernel doesn't + * read them; the executor produces them). Returns "" when nothing is read from parquet. + */ + private[delta] def synthesizeReadSchemaJson( + annotatedSource: Option[StructType], + requiredSchema: StructType, + partitionSchema: StructType): String = { + val byName = + annotatedSource.map(_.fields.map(f => f.name.toLowerCase(Locale.ROOT) -> f).toMap) + .getOrElse(Map.empty) + val pick = (f: StructField) => byName.getOrElse(f.name.toLowerCase(Locale.ROOT), f) + // Synthetics the executor produces itself (NOT read from kernel). + val workerOnly: Set[String] = Set( + DeltaReflection.IsRowDeletedColumnName, + DeltaReflection.RowCommitVersionColumnName, + "file_path", + "file_name", + "file_size", + "file_block_start", + "file_block_length", + "file_modification_time", + "base_row_id", + "default_row_commit_version").map(_.toLowerCase(Locale.ROOT)) + def isRowIndex(n: String): Boolean = + n.equalsIgnoreCase(DeltaReflection.RowIndexColumnName) || + n.equalsIgnoreCase(DeltaReflection.TmpMetadataRowIndexColumnName) + def isRowId(n: String): Boolean = + n.equalsIgnoreCase(DeltaReflection.RowIdColumnName) || + n.toLowerCase(Locale.ROOT).startsWith("_row-id-col-") + val kept: Array[StructField] = requiredSchema.fields.flatMap { f => + val lc = f.name.toLowerCase(Locale.ROOT) + if (workerOnly.contains(lc)) { + None // worker-side constant; not read from kernel + } else if (isRowIndex(f.name)) { + Some(asKernelMetadataColumn(f.name, "row_index")) + } else if (isRowId(f.name)) { + Some(asKernelMetadataColumn(f.name, KernelRowIdSpec)) + } else { + // Real data column -- includes the MATERIALISED `_row-commit-version-col-*` (kernel has no + // RowCommitVersion metadata column, so it's read from parquet by name, null-filled when a + // file lacks it). + Some(pick(f)) + } + } + val all = kept ++ partitionSchema.fields.map(pick) + if (all.isEmpty) "" else StructType(all).json + } + + /** A LONG field marked as kernel's `` metadata column (`delta.metadataSpec`). */ + private def asKernelMetadataColumn(name: String, spec: String): StructField = { + val md = new org.apache.spark.sql.types.MetadataBuilder() + .putString(KernelMetadataSpecKey, spec) + .build() + StructField(name, org.apache.spark.sql.types.LongType, nullable = true, md) + } + + private[delta] def translateDeltaFieldIdToParquet(field: StructField): StructField = { + val newDataType = translateDataTypeFieldIds(field.dataType) + val newMetadata = + if (field.metadata.contains(DeltaReflection.FieldIdMetadataKey) && + !field.metadata.contains(DeltaReflection.ParquetFieldIdMetadataKey)) { + val fieldId = field.metadata.getLong(DeltaReflection.FieldIdMetadataKey) + new org.apache.spark.sql.types.MetadataBuilder() + .withMetadata(field.metadata) + .putLong(DeltaReflection.ParquetFieldIdMetadataKey, fieldId) + .build() + } else field.metadata + StructField(field.name, newDataType, field.nullable, newMetadata) + } + + private def translateDataTypeFieldIds( + dt: org.apache.spark.sql.types.DataType): org.apache.spark.sql.types.DataType = + dt match { + case s: StructType => StructType(s.fields.map(translateDeltaFieldIdToParquet)) + case a: org.apache.spark.sql.types.ArrayType => + org.apache.spark.sql.types.ArrayType( + translateDataTypeFieldIds(a.elementType), + a.containsNull) + case m: org.apache.spark.sql.types.MapType => + org.apache.spark.sql.types.MapType( + translateDataTypeFieldIds(m.keyType), + translateDataTypeFieldIds(m.valueType), + m.valueContainsNull) + case other => other + } + + private[delta] def scanNeedsOneTaskPerPartition(scan: CometDeltaScanMarker): Boolean = + scan.deltaMetadata.oneTaskPerPartition + + /** + * True when the native plan will emit one parquet file-group per file (core_glue's + * `need_per_file_groups`): any `_metadata.*` virtual column / per-file row-tracking + * constant (`base_row_id`, `default_row_commit_version`) is requested, or a synthesized + * row-index / is-row-deleted / row_id / row_commit_version column is. These are all + * per-file values, so each file becomes its own group. When a Spark partition packs + * several files, those per-file groups execute concurrently and the synthetic-column + * append can mis-align with / drop whole groups (non-deterministically) -- the same class + * of bug fixed for materialised row-tracking columns and `input_file_name()`. Forcing one + * file per partition keeps every native plan single-file-group. See + * CometDeltaDefaultRowCommitVersionReproSuite / DefaultRowCommitVersionSuite, + * [[isMaterializedRowTrackingName]], and `project_concurrent_missing_column_drop`. + */ + private[delta] def needsPerFileGroups(scan: CometDeltaScanMarker): Boolean = { + val outNames = scan.output.map(_.name.toLowerCase(Locale.ROOT)).toSet + val reqNames = scan.requiredSchema.fieldNames.map(_.toLowerCase(Locale.ROOT)).toSet + // `_metadata.*` virtual columns + per-file row-tracking constants (these always force + // per-file groups natively because each carries a per-file value). + val perFileMetadataNames = PerFileMetadataNames + // Synthesized columns (never physical): row index + is-row-deleted. + val syntheticNames = Set( + "__delta_internal_row_index", + "_tmp_metadata_row_index", + "__delta_internal_is_row_deleted") + // row_id / row_commit_version are synthesized (-> per-file) only when row tracking is + // enabled; otherwise they are ordinary user column names (see the emit-flag gating in + // `convert`), so don't force per-file groups for them. + val rowTrackingEnabled = + DeltaReflection.extractMetadataConfiguration(scan.relation).exists { cfg => + cfg.get(DeltaReflection.EnableRowTrackingProp).exists(_.equalsIgnoreCase("true")) || + cfg.contains(DeltaReflection.MaterializedRowIdColumnProp) || + cfg.contains(DeltaReflection.MaterializedRowCommitVersionColumnProp) + } + outNames.exists(perFileMetadataNames.contains) || + reqNames.exists(syntheticNames.contains) || + (rowTrackingEnabled && + (reqNames.contains(DeltaReflection.RowIdColumnName) || + reqNames.contains(DeltaReflection.RowCommitVersionColumnName))) + } + + /** + * Reflectively resolve Hadoop's AWSCredentialProviderList for an s3/s3a URI and merge + * the resulting (access, secret, optional token) triple into `baseOptions` under the + * standard `fs.s3a.access.key` / `fs.s3a.secret.key` / `fs.s3a.session.token` keys -- + * the same keys `NativeConfig.extractObjectStoreOptions` would have picked up if the + * user had set them explicitly in `core-site.xml`. + * + * Reflection is intentional: `hadoop-aws` is an optional dep; on a default Comet + * deployment without S3 support on the classpath, `Class.forName` fails and we return + * the base options unchanged. Non-s3/s3a URIs return base options unchanged too -- + * Azure / GCS / OSS resolve their own credential chains in kernel-rs's object_store + * (or via the static keys already in `baseOptions`). + * + * Skip when the user has already set explicit static keys (don't overwrite an explicit + * config with a resolved IAM-instance token). + * + * If reflection succeeds but credential resolution fails (e.g. IMDS unreachable, no + * provider configured), log a warning and return `baseOptions` -- the engine will + * still try anonymous access or surface a clearer error than a silent crash on first + * S3 read. + */ + // Cached reflective binding for the S3A credential chain. Resolved once per JVM. + // The whole augment path is invoked on every Delta scan -- without caching, each scan + // pays a Class.forName + getMethod round-trip just to find the bridge available. + // + // `None` means we tried once and failed (hadoop-aws not on classpath, signature drift, + // etc.) -- subsequent calls short-circuit. + private case class S3ACredentialBinding( + createProviderList: java.lang.reflect.Method, + getCredentials: java.lang.reflect.Method, + getAccessKey: java.lang.reflect.Method, + getSecretKey: java.lang.reflect.Method, + sessionCredsCls: Option[Class[_]], + getSessionToken: Option[java.lang.reflect.Method]) + + @volatile private var s3aCredentialBindingCache: Option[Option[S3ACredentialBinding]] = None + + private def s3aCredentialBinding: Option[S3ACredentialBinding] = + s3aCredentialBindingCache.getOrElse { + val binding = try { + // scalastyle:off classforname + val utilsCls = Class.forName("org.apache.hadoop.fs.s3a.S3AUtils") + // scalastyle:on classforname + val createMethod = utilsCls.getMethod( + "createAWSCredentialProviderList", + classOf[java.net.URI], + classOf[org.apache.hadoop.conf.Configuration]) + // Resolve the provider-list + credentials methods off the runtime classes + // returned by createAWSCredentialProviderList. Method.invoke walks subclasses, so + // a one-time lookup on the declared return / argument types is enough. + val providerListCls = createMethod.getReturnType + val getCredentialsMethod = providerListCls.getMethod("getCredentials") + val credentialsCls = getCredentialsMethod.getReturnType + val getAccessKeyMethod = credentialsCls.getMethod("getAWSAccessKeyId") + val getSecretKeyMethod = credentialsCls.getMethod("getAWSSecretKey") + val (sessionCredsCls, getSessionTokenMethod) = try { + // scalastyle:off classforname + val cls = Class.forName("com.amazonaws.auth.AWSSessionCredentials") + // scalastyle:on classforname + (Some(cls), Some(cls.getMethod("getSessionToken"))) + } catch { case _: ClassNotFoundException => (None, None) } + Some( + S3ACredentialBinding( + createMethod, + getCredentialsMethod, + getAccessKeyMethod, + getSecretKeyMethod, + sessionCredsCls, + getSessionTokenMethod)) + } catch { + // hadoop-aws not on classpath, or signature drift -- mark as unavailable for the + // rest of the JVM's lifetime. + case _: ClassNotFoundException => None + case _: NoSuchMethodException => None + case scala.util.control.NonFatal(e) => + logWarning( + s"S3A credential-chain reflection lookup failed; falling back to static-only " + + s"keys in Delta log replay: ${e.getMessage}", + e) + None + } + s3aCredentialBindingCache = Some(binding) + binding + } + + private[delta] def augmentWithResolvedAwsCredentials( + baseOptions: Map[String, String], + tableRootUri: java.net.URI, + hadoopConf: org.apache.hadoop.conf.Configuration): Map[String, String] = { + val scheme = Option(tableRootUri.getScheme).map(_.toLowerCase).getOrElse("") + if (scheme != "s3" && scheme != "s3a") return baseOptions + if (baseOptions.contains("fs.s3a.access.key") && + baseOptions.contains("fs.s3a.secret.key")) { + return baseOptions + } + s3aCredentialBinding match { + case None => baseOptions // hadoop-aws not available; nothing to resolve + case Some(binding) => + try { + val providerList = binding.createProviderList.invoke(null, tableRootUri, hadoopConf) + val credentials = binding.getCredentials.invoke(providerList) + val accessKey = binding.getAccessKey.invoke(credentials) + val secretKey = binding.getSecretKey.invoke(credentials) + val sessionToken: Option[String] = (binding.sessionCredsCls, binding.getSessionToken) match { + case (Some(cls), Some(m)) if cls.isInstance(credentials) => + Option(m.invoke(credentials)).map(_.toString) + case _ => None + } + val resolved = scala.collection.mutable.Map[String, String]() ++= baseOptions + Option(accessKey).map(_.toString).filter(_.nonEmpty).foreach { ak => + resolved("fs.s3a.access.key") = ak + } + Option(secretKey).map(_.toString).filter(_.nonEmpty).foreach { sk => + resolved("fs.s3a.secret.key") = sk + } + sessionToken.filter(_.nonEmpty).foreach { st => + resolved("fs.s3a.session.token") = st + } + resolved.toMap + } catch { + case scala.util.control.NonFatal(e) => + logWarning( + s"Delta log-replay credential resolution failed for $tableRootUri: " + + s"${e.getMessage}; falling back to static-only keys in storage options", + e) + baseOptions + } + } + } + + override def enabledConfig: Option[ConfigEntry[Boolean]] = Some( + DeltaConf.COMET_DELTA_NATIVE_ENABLED) + + override def getSupportLevel(operator: CometDeltaScanMarker): SupportLevel = Compatible() + + override def convert( + scan: CometDeltaScanMarker, + builder: Operator.Builder, + childOp: OperatorOuterClass.Operator*): Option[OperatorOuterClass.Operator] = { + + // Resolve the table root via the HadoopFsRelation API - standard Spark, no spark-delta + // compile-time dep required. + val relation = scan.relation + val tableRoot = DeltaReflection.extractTableRoot(relation).getOrElse { + logWarning( + s"CometDeltaNativeScan: unable to extract table root from relation " + + s"${relation.location}; falling back to Spark's Delta reader.") + return None + } + + // Detect Delta synthetic columns the surrounding plan requested. We strip them + // from the proto schemas sent to native so the parquet reader doesn't look for + // columns that don't exist on disk, and set the proto emit flags so the dispatcher + // wraps the parquet scan in `DeltaSyntheticColumnsExec` to append them back. + // - `__delta_internal_row_index` / `__delta_internal_is_row_deleted` are + // UPDATE/DELETE/MERGE internals (#144). + // - `row_id` / `row_commit_version` are row-tracking columns when the table has + // `delta.enableRowTracking=true` but no materialised columns -- synthesised + // from baseRowId + physical row index per task. + // Row index can appear under either name in the scan output: the canonical + // `__delta_internal_row_index` (Delta synthetic-column path), or the + // intermediate `_tmp_metadata_row_index` (Delta's + // `DeltaParquetFileFormat.TMP_METADATA_ROW_INDEX_COLUMN_NAME`, used for plans + // that read `_metadata.row_index` from row-tracking-enabled tables before + // Delta projects the alias). Both cases go through the same native synthesis + // -- just with a different output column name. + val rowIndexCanonicalPresent = scan.requiredSchema.fieldNames.exists( + _.equalsIgnoreCase(DeltaReflection.RowIndexColumnName)) + val rowIndexTmpMetadataPresent = scan.requiredSchema.fieldNames.exists( + _.equalsIgnoreCase(DeltaReflection.TmpMetadataRowIndexColumnName)) + // Both names denote the same physical value (the parquet row index), but they can + // appear together in a single scan: DELETE/UPDATE/MERGE on a DV-enabled table with + // `spark.databricks.delta.deletionVectors.useMetadataRowIndex=false` reads files that + // already carry a deletion vector. There the scan needs `_metadata.row_index` + // (-> `_tmp_metadata_row_index`) to APPLY the existing DV and the explicit + // `__delta_internal_row_index` column to build the NEW DV bitmap. Native synthesis + // emits a single row-index column under one name, and the final-reorder Projection + // names its outputs from the wrapped (native) schema, so it cannot produce two + // distinctly-named row-index outputs. Rather than misname them, fall back to Spark's + // Delta reader for this scan. This shape only arises in Delta's internal DV-maintenance + // read (never a user query), so there is no user-facing perf impact; the common + // useMetadataRowIndex=true path (a single row-index name) is unaffected. + // Repro: CometDeltaDeleteWithDVReproSuite; regression: DeleteSQLWithDeletionVectorsSuite. + if (rowIndexCanonicalPresent && rowIndexTmpMetadataPresent) { + logInfo( + "CometDeltaNativeScan: scan.requiredSchema requests both " + + s"${DeltaReflection.RowIndexColumnName} and " + + s"${DeltaReflection.TmpMetadataRowIndexColumnName} (DV-maintenance read with " + + "useMetadataRowIndex=false); falling back to Spark's Delta reader for this scan.") + return None + } + val emitRowIndex = rowIndexCanonicalPresent || rowIndexTmpMetadataPresent + val rowIndexColumnAlias: String = + if (rowIndexTmpMetadataPresent && !rowIndexCanonicalPresent) + DeltaReflection.TmpMetadataRowIndexColumnName + else "" + val emitIsRowDeleted = scan.requiredSchema.fieldNames.exists( + _.equalsIgnoreCase(DeltaReflection.IsRowDeletedColumnName)) + // `row_id` / `row_commit_version` are reserved names ONLY when row tracking is enabled -- + // then they are metadata columns we synthesize (baseRowId + row_index, etc.). With row + // tracking disabled they are ordinary user column names with no special meaning, and a + // user table may legitimately have a physical column called `row_id`. Deriving the emit + // flags purely from the column name mistook such a user column for the synthetic, stripped + // it from the parquet read, and synthesized garbage (RowIdSuite "row_id column with row ids + // disabled" -> NPE/wrong values). Gate the synthetic emit on row tracking actually being + // enabled on the table. Repro: CometDeltaRowIdColumnCollisionReproSuite. + val rowTrackingEnabled: Boolean = + DeltaReflection.extractMetadataConfiguration(relation).exists { cfg => + cfg.get(DeltaReflection.EnableRowTrackingProp).exists(_.equalsIgnoreCase("true")) || + cfg.contains(DeltaReflection.MaterializedRowIdColumnProp) || + cfg.contains(DeltaReflection.MaterializedRowCommitVersionColumnProp) + } + val emitRowId = rowTrackingEnabled && + scan.requiredSchema.fieldNames.exists(_.equalsIgnoreCase(DeltaReflection.RowIdColumnName)) + val emitRowCommitVersion = rowTrackingEnabled && + scan.requiredSchema.fieldNames.exists( + _.equalsIgnoreCase(DeltaReflection.RowCommitVersionColumnName)) + + val ignoreMissingFiles = + SQLConf.get.ignoreMissingFiles || + relation.options.get("ignoremissingfiles").contains("true") + + // Cloud storage options for kernel log replay and the native parquet reader, + // keyed identically to NativeScan. See `resolveStorageOptions`. + val storageOptions: java.util.Map[String, String] = resolveStorageOptions(scan, tableRoot) + + // Honor Delta's time-travel options (versionAsOf / timestampAsOf) via the Delta- + // resolved snapshot version sitting on the FileIndex. Delta's analysis phase pins + // the exact snapshot before we ever see the plan, so by the time the marker is + // built, `relation.location` is a `PreparedDeltaFileIndex` whose toString looks like + // `Delta[version=0, file:/...]`. We parse the version out via + // `DeltaReflection.extractSnapshotVersion` and pass it through to kernel. + // + // When no version can be extracted (non-Delta file index, parser miss, etc.) we pass + // -1 which asks kernel for the current latest snapshot. + val snapshotVersion: Long = + DeltaReflection.extractSnapshotVersion(relation).getOrElse(-1L) + + // Serialize the data filters so kernel can apply stats-based file pruning during log replay. + val predicateBytes: Array[Byte] = serializeSupportedDataFilters(scan) + + // Stage B/C: produce synthetic columns inside DeltaKernelScanExec (kernel metadata columns + + // engine-side DV-invert / per-file constants), retiring the stacked DeltaSyntheticColumnsExec. + // Needs kernel's per-file transform (row_id GenerateRowId + partition injection), which only the + // kernel-enumeration path ships. Regular reads always take it; DML rewrites (TahoeBatchFileIndex) + // get it via kernel-enumerate + path-filter (set below, only if every touched file matched). + // In-worker synthesis is now the ONLY native synthesis path; the legacy stacked + // DeltaSyntheticColumnsExec is removed (#82). Regular reads always synthesize in-worker; subset + // reads (DML rewrites) do too when every touched file matched kernel enumeration, otherwise they + // decline to vanilla Spark (the `case None` branch below). A read that would have needed the old + // exec therefore either synthesizes in-worker or declines -- it never reaches a non-synthesize + // native path. + var synthesizeInWorker: Boolean = + !DeltaReflection.isSubsetFileIndex(relation.location) + + // Column name list for resolving BoundReference indices to kernel column + // names. Must match the order of scan.output because exprToProto binds + // attribute references by position in that schema. + val columnNames: Array[String] = scan.output.map(_.name).toArray + + // --- 1. Get the active file list. --- + // + // Two code paths: + // (a) Exact-subset FileIndex (`TahoeBatchFileIndex`, `CdcAddFileIndex`, + // `TahoeRemoveFileIndex`, `TahoeChangeFileIndex`): Delta's streaming + // micro-batch reads AND MERGE / UPDATE / DELETE post-join rewrites carry + // an exact `addFiles: Seq[AddFile]` on the FileIndex. Kernel log replay + // against the snapshot would return a DIFFERENT file set (the whole + // snapshot, or a version's deltas), which is a correctness hazard -- + // empty streaming batches, MERGE rewrites that see the whole table + // instead of only touched files. Build the DeltaScanTaskList proto + // directly from those AddFiles, skipping kernel. + // (b) Regular scan against a snapshot (`PreparedDeltaFileIndex` / + // `TahoeLogFileIndex` -- the vast majority): call kernel for log replay. + // Kernel reproduces the pruned active file set from the pinned snapshot + + // the shipped data predicate, and ships its OWN per-file transform so + // partition injection / column-mapping relabel / row-tracking come from + // kernel rather than Comet-side reconstruction. + val taskListBytes = + if (DeltaReflection.isSubsetFileIndex(relation.location)) { + // Pass BOTH the scan's partition filters AND data filters through + // so `refreshedSnapshotFiles` (which queries + // `snapshot.filesForScan(filters, ...)`) re-applies the same + // partition pruning + stats-based data-skipping Delta did at + // planning time. Without this, on `PreparedDeltaFileIndex` the + // refresh path returns ALL files, breaking stats-based file + // pruning (e.g. StatsCollectionSuite "gather stats" -- the + // partition column is `odd` but the test filter is on `id` which + // is a data column; only data-filter skipping makes the assertion + // `recordsScanned(df.where("id = 1")) == 1` hold). + DeltaReflection.extractBatchAddFiles( + relation.location, + scan.partitionFilters ++ scan.dataFilters) match { + case Some(addFiles) => + // DV handling: the driver only ships a DV DESCRIPTOR per AddFile + // (storage type / path / offset / size, KB-scale). The executor decodes + // via `dv_reader::read_dv_indexes` on first poll. Pre-#218 we called + // `materializeDeletedRowIndexes` here and shipped the expanded + // `Vec` -- a single 99M-row DV is a ~1 GB `long[]` retained on the + // driver heap until the scan finishes. Matches the Iceberg contrib's + // `IcebergScanCommon.delete_files_pool` pattern (driver = references, + // executor = decode). If a DV file is missing/corrupt the executor + // surfaces a `SparkException` -- same observable behaviour as before, + // just at execution rather than planning. + // Option (a): DML rewrites (TahoeBatchFileIndex -- touched files are a subset of the + // pinned snapshot) get kernel's per-file transforms by enumerating the snapshot and + // filtering to the touched AddFile paths, so they synthesize in-worker like regular + // reads. Decline to the legacy AddFiles path if ANY touched file isn't in kernel's + // enumeration (safe: the legacy DeltaSyntheticColumnsExec path still works -- never wrong + // files). CDC indexes (files outside the snapshot) are excluded by isDmlRewriteFileIndex. + val dmlSynthBytes: Option[Array[Byte]] = + if (DeltaReflection.isDmlRewriteFileIndex(relation.location)) { + try { + val annotated = scan.deltaMetadata.analyzedSchema.orElse( + DeltaReflection.extractSnapshotSchema(relation)) + val projJson = CometDeltaNativeScan.synthesizeReadSchemaJson( + annotated, + scan.requiredSchema, + relation.partitionSchema) + // Empty predicate: the touched AddFile set is the authoritative selection; kernel + // stats pruning could drop a touched file and force a needless decline. + val kernelTaskList = DeltaScanTaskList.parseFrom( + nativeLib.planDeltaScan( + tableRoot, + snapshotVersion, + storageOptions, + Array.emptyByteArray, + columnNames, + projJson)) + // Resolve touched AddFile paths the SAME way the native side resolves kernel paths + // (table_root + rel, or pass-through for scheme'd paths), then match by file_path. + val sep = if (tableRoot.endsWith("/")) "" else "/" + val touched: Set[String] = addFiles.map { af => + if (af.path.contains(":/")) af.path else tableRoot + sep + af.path + }.toSet + val matched = kernelTaskList.getTasksList.asScala + .filter(t => touched.contains(t.getFilePath)) + if (touched.nonEmpty && matched.size == touched.size) { + Some( + DeltaScanTaskList + .newBuilder() + .setSnapshotVersion(kernelTaskList.getSnapshotVersion) + .setTableRoot(kernelTaskList.getTableRoot) + .addAllUnsupportedFeatures(kernelTaskList.getUnsupportedFeaturesList) + .setPhysicalSchema(kernelTaskList.getPhysicalSchema) + .setLogicalSchema(kernelTaskList.getLogicalSchema) + .addAllTasks(matched.asJava) + .build() + .toByteArray) + } else { + None + } + } catch { + case scala.util.control.NonFatal(e) => + logWarning( + s"CometDeltaNativeScan: DML kernel-enumerate for $tableRoot failed; " + + s"using legacy AddFiles path", + e) + None + } + } else { + None + } + dmlSynthBytes match { + case Some(bytes) => + synthesizeInWorker = true + bytes + case None => + // #82: the legacy buildTaskListFromAddFiles + DeltaSyntheticColumnsExec path is + // retired. The only reads that reached it were CDC-family subset indexes (now read + // natively via kernel TableChanges -- CometDeltaCdfScanExec, #84 -- so they never + // become a marker here) and the rare DML declines that can't synthesize in-worker + // (CM-id materialised row_commit_version; OPTIMIZE file-not-found race). Decline the + // latter to vanilla Spark with the same withFallbackReason mechanism the + // reflection-failure branch below uses (proven to cleanly drop the Comet boundary). + import org.apache.comet.CometSparkSessionExtensions.withFallbackReason + withFallbackReason( + scan, + s"Native Delta scan declines a subset-file-index read that cannot synthesize " + + s"in-worker (${relation.location.getClass.getName}); falling back to Spark.") + return None + } + case None => + // Reflection failed; fall back conservatively. + import org.apache.comet.CometSparkSessionExtensions.withFallbackReason + withFallbackReason( + scan, + s"Native Delta scan could not extract AddFiles from " + + s"${relation.location.getClass.getName}; falling back.") + return None + } + } else { + // Regular reads (`PreparedDeltaFileIndex` / `TahoeLogFileIndex`): kernel enumerates the + // pinned snapshot (reproducing Delta's pruned active file set via the shipped data predicate) + // and ships its OWN per-file transform, so partition injection / column-mapping relabel / + // row-tracking come from kernel. DV-aware INTERNAL reads (Delta-PreprocessTableWithDVs with + // inverted row-index-filter semantics) are kept on vanilla upstream by + // `CometScanRule.scanBelowFallsBackForDvs`. + try { + // The driver's `with_schema` gets the ANALYSIS-TIME read schema (Delta JSON, carrying + // column-mapping physicalName/id) so kernel resolves the names the query was planned with + // -> correct under schema-change-since-analysis. Fall back to the live snapshot schema when + // there's no stashed reference schema; both carry the annotations kernel needs. Include the + // partition schema so kernel's transform INJECTS partition columns (max-kernel) rather than + // Comet appending them. + val annotated = scan.deltaMetadata.analyzedSchema.orElse( + DeltaReflection.extractSnapshotSchema(relation)) + val projJson = + if (synthesizeInWorker) { + // Kernel read = data + partitions + row_index/row_id as kernel metadata columns; the + // executor synthesises is_row_deleted / row_commit_version / _metadata.* itself. + CometDeltaNativeScan.synthesizeReadSchemaJson( + annotated, + scan.requiredSchema, + relation.partitionSchema) + } else { + CometDeltaNativeScan.dataReadSchemaJson( + annotated, + scan.requiredSchema, + relation.partitionSchema, + rowTrackingActive = rowTrackingEnabled) + } + nativeLib.planDeltaScan( + tableRoot, + snapshotVersion, + storageOptions, + predicateBytes, + columnNames, + projJson) + } catch { + case scala.util.control.NonFatal(e) => + logWarning( + s"CometDeltaNativeScan: delta-kernel-rs log replay failed for $tableRoot", + e) + return None + } + } + val taskList = DeltaScanTaskList.parseFrom(taskListBytes) + // Column mapping no longer needs any executor-side plumbing: the kernel-read path ships kernel's + // own `scan.physical_schema()` / `logical_schema()` (physical names + field-ids resolved at every + // nesting level), and kernel returns partition values already translated to logical names + // (driver-side, in `planDeltaScan` / `buildTaskListFromAddFiles`). The former re-derivation of a + // `column_mappings` tree here existed only to feed the removed `physicalise_field` schema rebuild. + + // Phase 6 reader-feature gate. Kernel reports any Delta reader features that + // are currently in use in this snapshot and that Comet's native path does NOT + // correctly handle. Falling back is mandatory for correctness: reading through + // the native path would silently produce wrong results (e.g. returning rows + // that a deletion vector should have hidden). The gate becomes obsolete feature + // by feature as later phases ship: + // deletionVectors -> Phase 3 + // columnMapping -> Phase 4 + // typeWidening -> future phase + // rowTracking -> future phase + val unsupportedFeatures = taskList.getUnsupportedFeaturesList.asScala.toSeq + if (unsupportedFeatures.nonEmpty && + DeltaConf.COMET_DELTA_FALLBACK_ON_UNSUPPORTED_FEATURE.get(scan.conf)) { + logInfo( + s"CometDeltaNativeScan: falling back for table $tableRoot " + + s"due to unsupported reader features: ${unsupportedFeatures.mkString(", ")}") + import org.apache.comet.CometSparkSessionExtensions.withFallbackReason + withFallbackReason( + scan, + s"Native Delta scan does not yet support these features in use on this " + + s"snapshot: ${unsupportedFeatures.mkString(", ")}. Falling back to Spark's " + + s"Delta reader. Set ${DeltaConf.COMET_DELTA_FALLBACK_ON_UNSUPPORTED_FEATURE.key}=false " + + s"to bypass this check (NOT recommended - may produce incorrect results).") + return None + } + + // Apply Spark's partition filters to the task list so that queries like + // `WHERE partition_col = X` don't drag in files from other partitions. Kernel + // itself is given the whole snapshot (no predicate yet - that lands in Phase 2), + // so we do the pruning in Scala by evaluating each task's partition-value map + // against Spark's `partitionFilters`. This is a single driver-side loop; filtered + // tasks never go over the wire to executors. + val filteredTasks0 = + prunePartitions(taskList.getTasksList.asScala.toSeq, scan, relation.partitionSchema) + + // Split files larger than `maxSplitBytes` into byte-range chunks so a single + // big parquet file can be read across multiple Spark partitions, matching + // Spark's `FilePartition.splitFiles` semantics. This is what makes + // FILES_MAX_PARTITION_BYTES, files.openCostInBytes, and + // files.minPartitionNum take effect on Delta tables: without it every file + // is exactly one partition and the *.size assertions in + // DeletionVectorsSuite's PredicatePushdown tests fail (they configure + // FILES_MAX_PARTITION_BYTES=2MB on a multi-row-group fixture and assert + // exactly 2 splits). + val filteredTasks = + splitTasks(scan, filteredTasks0) + + // --- 2. Build the common block --- + val commonBuilder = DeltaScanCommon.newBuilder() + commonBuilder.setSource(scan.simpleStringWithNodeId()) + commonBuilder.setTableRoot(taskList.getTableRoot) + commonBuilder.setSnapshotVersion(taskList.getSnapshotVersion) + commonBuilder.setSessionTimezone(scan.conf.sessionLocalTimeZone) + commonBuilder.setCaseSensitive(scan.conf.getConf[Boolean](SQLConf.CASE_SENSITIVE)) + commonBuilder.setIgnoreMissingFiles(ignoreMissingFiles) + commonBuilder.setDataFileConcurrencyLimit( + DeltaConf.COMET_DELTA_DATA_FILE_CONCURRENCY_LIMIT.get()) + + // `required_schema` on the wire is the SCAN's output schema -- the data columns the scan reads + // from parquet PLUS partition columns it materialises from PartitionedFile.partition_values. + // For non-partitioned tables `scan.requiredSchema` is already the whole output; for partitioned + // tables Spark gives us just the data half, so append the partition fields at the tail (the + // native side splits them back out by name). + val partitionFieldsForRequired: Array[StructField] = { + val haveLc = scan.requiredSchema.fields.map(_.name.toLowerCase(Locale.ROOT)).toSet + relation.partitionSchema.fields.filterNot(f => + haveLc.contains(f.name.toLowerCase(Locale.ROOT))) + } + // Spark `_metadata.*` virtual columns plus Delta row-tracking synthetics that + // appear in scan.output but not scan.requiredSchema. They are synthesised natively + // below (via metadataColumnNamesEmitted) and must appear in the wrapped exec + // output schema for downstream attribute resolution. + val sparkMetadataNameSet = SparkFileMetadataNames + def isExtraSyntheticName(name: String): Boolean = { + val lc = name.toLowerCase(Locale.ROOT) + // NOTE: materialised row-tracking columns (`_row-id-col-*` / + // `_row-commit-version-col-*`) are deliberately NOT here -- they are real + // parquet columns read from the file (added to the data schema), not synthesised. + sparkMetadataNameSet.contains(lc) || + lc == "base_row_id" || + lc == "default_row_commit_version" + } + val extraMetadataFields: Array[StructField] = scan.output.toArray.collect { + case a if isExtraSyntheticName(a.name) && + !scan.requiredSchema.fieldNames.exists(_.equalsIgnoreCase(a.name)) => + StructField(a.name, a.dataType, a.nullable) + } + // Required schema for the proto wire: PURE-LOGICAL at every nesting level. The native + // kernel-read planner physicalises it via the recursive `column_mappings`; partition columns + // are appended at the tail and split back out by name on the native side. + val requiredSchemaLogicalFields = + scan.requiredSchema.fields ++ partitionFieldsForRequired ++ extraMetadataFields + + // Column-mapping `id` mode: Delta stores the parquet field ID on every + // StructField (at every level of nesting) under + // `delta.columnMapping.id`. Spark's `ParquetUtils.hasFieldId` (used by + // `schema2Proto` and the StructType arm of `serializeDataType`) reads from + // `parquet.field.id`. Walk the schema tree and translate keys so the + // native side -- when `use_field_id=true` -- matches Spark schema fields + // to parquet file fields by ID instead of by name. + val cmModeIsId = DeltaReflection + .extractMetadataConfiguration(relation) + .flatMap(_.get("delta.columnMapping.mode")) + .exists(_.equalsIgnoreCase("id")) + // The general-purpose Parquet field-ID read path also drives `use_field_id`: if + // the user has enabled `spark.sql.parquet.fieldId.read.enabled` AND the required + // schema already carries Spark's `parquet.field.id` metadata, route through the + // same native machinery. CM-id mode is the common Delta case; this catches + // non-Delta-id tables that nevertheless want field-ID matching. + val sparkFieldIdReadEnabled = SQLConf.get.getConf(SQLConf.PARQUET_FIELD_ID_READ_ENABLED) && + org.apache.spark.sql.execution.datasources.parquet.ParquetUtils.hasFieldIds( + scan.requiredSchema) + val useFieldIdActive = cmModeIsId || sparkFieldIdReadEnabled + val requiredSchemaForProto = + if (cmModeIsId) { + requiredSchemaLogicalFields.map(CometDeltaNativeScan.translateDeltaFieldIdToParquet) + } else requiredSchemaLogicalFields + val partitionSchemaForProto = + if (cmModeIsId) { + relation.partitionSchema.fields.map( + CometDeltaNativeScan.translateDeltaFieldIdToParquet) + } else relation.partitionSchema.fields + + // Strip Delta synthetic columns from the proto schemas. They're not on disk so the + // native parquet reader must not look for them; `DeltaSyntheticColumnsExec` appends + // them back after the scan. Required precondition: synthetics must be a SUFFIX of + // scan.requiredSchema -- otherwise the appended order wouldn't match Spark's + // expected output. The standard Delta DV-rewrite path satisfies this; anything else + // falls back. If we detect the suffix doesn't hold, decline and let Spark's reader + // handle it (correctness over coverage). + val syntheticNames = Set( + DeltaReflection.RowIndexColumnName.toLowerCase(Locale.ROOT), + DeltaReflection.TmpMetadataRowIndexColumnName.toLowerCase(Locale.ROOT), + DeltaReflection.IsRowDeletedColumnName.toLowerCase(Locale.ROOT), + DeltaReflection.RowIdColumnName, + DeltaReflection.RowCommitVersionColumnName, + // Spark `_metadata.*` virtual columns synthesised natively per-task. + "file_path", + "file_name", + "file_size", + "file_block_start", + "file_block_length", + "file_modification_time", + // Delta row-tracking columns synthesised natively. Both are per-file constants + // from AddFile.baseRowId / AddFile.defaultRowCommitVersion; the materialised + // columns are null when the parquet file doesn't carry them. Must be kept in + // sync with `fixedMetadataNames` below and the proto setters in + // `buildTaskListFromAddFiles` so the native side actually emits these. + "base_row_id", + "default_row_commit_version") + val isSynthetic = (f: StructField) => { + // Materialised row-tracking columns are NOT synthetic -- they are read from parquet, so they + // must stay in the required schema. + syntheticNames.contains(f.name.toLowerCase(Locale.ROOT)) + } + // metadataColumnNames includes the Spark `_metadata.*` virtual columns (file_path, + // file_name, file_size, file_block_start, file_block_length, file_modification_time) + // that Delta's strategies inject. These are synthesised per-task in + // `DeltaSyntheticColumnsExec`, so when any are required we need the synthetic-emit + // path even without emit_row_index/is_row_deleted/row_id/row_commit_version set. + val sparkMetadataNames = SparkFileMetadataNames + val requiredFieldNamesLower: Set[String] = + scan.requiredSchema.fields.map(_.name.toLowerCase(Locale.ROOT)).toSet + // Spark also appends `_metadata.*` columns to scan.output (not requiredSchema) when + // downstream operators (e.g. Delta's PreprocessTableWithDVs) bind to them by name. + // The wrapped exec's output schema must include them so attribute resolution works. + val outputFieldNamesLower: Set[String] = + scan.output.map(_.name.toLowerCase(Locale.ROOT)).toSet + // PerFileMetadataNames includes `default_row_commit_version` alongside `base_row_id`: + // dropping it makes the emit-name list short a column, so CDC / row-tracking reads see + // N-1 cols where Spark expected N (notably under coordinated-commits backfill). + val fixedMetadataNames = PerFileMetadataNames + // The wrapped exec output is `parquet projection ++ row_index/is_row_deleted/... + // ++ metadata_column_names` in the order metadata names are emitted. To make the + // post-synthesis layout match scan.output WITHOUT a final reorder Project, walk + // scan.output and pick out the metadata-style columns in the order they appear. + val metadataColumnNamesEmitted: Seq[String] = scan.output.flatMap { attr => + val lc = attr.name.toLowerCase(Locale.ROOT) + // Materialised row-tracking columns are read from parquet, not synthesised, so + // they are excluded here. + if (fixedMetadataNames.contains(lc)) Some(lc) else None + }.distinct + val needsMetadataEmit = metadataColumnNamesEmitted.nonEmpty + val needsSyntheticEmit = + emitRowIndex || emitIsRowDeleted || emitRowId || emitRowCommitVersion || needsMetadataEmit + // When synthetics are NOT a contiguous suffix of required_schema, build a reorder + // map: for each original required-schema position, an index into the wrapped exec's + // output (parquet output cols followed by appended synthetics in canonical order + // row_index, is_row_deleted, row_id, row_commit_version). The native dispatcher + // applies a final ProjectionExec to reorder columns to match Spark's expected + // output layout. Empty when synthetics ARE a suffix -- already in the right order. + // In synthesize mode the executor assembles the full output BY NAME, so no positional reorder is + // needed and `required_schema` IS the full output (= scan.output) -- synthetics are NOT stripped. + val finalOutputIndices: Seq[Int] = + if (synthesizeInWorker) Seq.empty + else + computeFinalOutputIndices( + needsSyntheticEmit, + requiredSchemaForProto, + isSynthetic, + emitRowIndex, + emitIsRowDeleted, + emitRowId, + emitRowCommitVersion, + rowIndexColumnAlias, + metadataColumnNamesEmitted) + val requiredSchemaForProtoStripped = + if (synthesizeInWorker) { + // Full output the executor must emit (data + partitions + ALL synthetics) in scan.output + // order; the by-name assembler places each column. Logical names (no id-translation -- field + // ids ride on the kernel READ schema, not the output). + scan.output.map(a => StructField(a.name, a.dataType, a.nullable)).toArray + } else if (needsSyntheticEmit) { + requiredSchemaForProto.filterNot(isSynthetic) + } else { + requiredSchemaForProto + } + + val requiredSchema = schema2Proto(requiredSchemaForProtoStripped) + val partitionSchema = schema2Proto(partitionSchemaForProto) + commonBuilder.addAllRequiredSchema(requiredSchema.toIterable.asJava) + commonBuilder.addAllPartitionSchema(partitionSchema.toIterable.asJava) + // Kernel-built projected schemas (`scan.physical_schema()` / `scan.logical_schema()`, Arrow + // IPC) -- correct physical names + field-ids at EVERY nesting level. The executor's kernel-read + // planner uses them verbatim. The kernel-driver `planDeltaScan` path returns them inline; the + // batch-file-index path (file list from AddFiles) fetches them via the schema-only + // `planDeltaReadSchemas`. (For a read with zero data columns there are none, and none are + // needed -- the executor drives the row count without a parquet read.) + val kernelSchemaSource: DeltaScanTaskList = + if (!taskList.getPhysicalSchema.isEmpty) { + taskList + } else { + // Analysis-time read schema (Delta JSON), falling back to the live snapshot schema. Empty => + // zero data columns => no kernel schemas needed. + val projJson = CometDeltaNativeScan.dataReadSchemaJson( + scan.deltaMetadata.analyzedSchema.orElse(DeltaReflection.extractSnapshotSchema(relation)), + scan.requiredSchema, + rowTrackingActive = rowTrackingEnabled) + if (projJson.isEmpty) { + taskList + } else { + try { + DeltaScanTaskList.parseFrom( + nativeLib + .planDeltaReadSchemas(tableRoot, snapshotVersion, storageOptions, projJson)) + } catch { + case scala.util.control.NonFatal(e) => + // The kernel-read path has no Comet-side physicalisation fallback; if kernel can't + // build the read schemas, decline to native and let Spark's reader handle it. + import org.apache.comet.CometSparkSessionExtensions.withFallbackReason + withFallbackReason( + scan, + s"Native Delta scan could not build kernel read schemas for $tableRoot: $e") + return None + } + } + } + if (!kernelSchemaSource.getPhysicalSchema.isEmpty) { + commonBuilder.setKernelPhysicalSchema(kernelSchemaSource.getPhysicalSchema) + commonBuilder.setKernelLogicalSchema(kernelSchemaSource.getLogicalSchema) + } + commonBuilder.setUseFieldId(useFieldIdActive) + commonBuilder.setEmitRowIndex(emitRowIndex) + commonBuilder.setEmitIsRowDeleted(emitIsRowDeleted) + commonBuilder.setEmitRowId(emitRowId) + commonBuilder.setEmitRowCommitVersion(emitRowCommitVersion) + if (rowIndexColumnAlias.nonEmpty) { + commonBuilder.setRowIndexColumnAlias(rowIndexColumnAlias) + } + // Add the `_metadata.*` virtual column names we will synthesise natively (computed + // above as `metadataColumnNamesEmitted` from `scan.requiredSchema`). + metadataColumnNamesEmitted.foreach(commonBuilder.addMetadataColumnNames) + commonBuilder.addAllFinalOutputIndices( + finalOutputIndices.map(i => Integer.valueOf(i)).asJava) + + + + // Kernel-read is the only Delta read path: every file is read through delta-kernel-rs (read + + // transform + DV) by DeltaKernelScanExec, which produces all output columns in-worker, by name. + // The native side splits required_schema into data (read from parquet) + partition (injected) + // columns; column mapping (incl. nested, #47), partitions, row-tracking, _metadata columns, and + // zero-data-column reads (partition-only, e.g. groupBy(partition).agg(count("*")); the exec + // drives the row count from record_count / the parquet footer, #48) are all handled here. + commonBuilder.setSynthesizeInWorker(synthesizeInWorker) + // Delta's test mode prepends `spark.databricks.delta.testOnly.dvFileNamePrefix` to DV + // filenames; delta-kernel-rs doesn't honour that JVM-only conf, so the executor splices it + // back into kernel's resolved DV path. Empty in production (no-op). + commonBuilder.setDvFileNamePrefix(DeltaReflection.dvFileNamePrefix(scan.conf)) + + // (Data-filter pushdown belonged to the removed ParquetSource path; the kernel-read path does + // its own stats-based file pruning during log replay, so no pushed predicate is shipped.) + + storageOptions.asScala.foreach { case (key, value) => + commonBuilder.putObjectStoreOptions(key, value) + } + + // (Column mapping is fully resolved by kernel: the executor reads with the shipped + // `kernel_physical_schema` / `kernel_logical_schema`, so no `column_mappings` tree is sent.) + + // --- 3. Pack into a DeltaScan with COMMON ONLY (split-mode, Phase 5). + // Tasks are NOT included in the proto at planning time. They'll be + // serialized per-partition in CometDeltaNativeScanExec.serializedPartitionData + // at execution time, and merged via DeltaPlanDataInjector. + val deltaScanBuilder = DeltaScan.newBuilder() + deltaScanBuilder.setCommon(commonBuilder.build()) + // table_root is also threaded into each per-partition DeltaScan in + // CometDeltaNativeScanExec.packTasks; set it here as well so the planning-time + // proto carries it for any consumer that reads the parent DeltaScan directly. + val plannedTableRoot = taskList.getTableRoot + if (plannedTableRoot != null && plannedTableRoot.nonEmpty) { + deltaScanBuilder.setTableRoot(plannedTableRoot) + } + // No addAllTasks: tasks stay in taskListBytes for the exec's lazy split. + + // Stash the full task-list bytes for createExec to retrieve. The ThreadLocal + // bridges the convert() -> createExec() gap in CometExecRule.convertToComet. + // Build a modified taskList with ONLY the filtered tasks (partition-pruned). + val filteredTaskList = DeltaScanTaskList + .newBuilder() + .setSnapshotVersion(taskList.getSnapshotVersion) + .setTableRoot(taskList.getTableRoot) + .addAllTasks(filteredTasks.asJava) + .addAllUnsupportedFeatures(taskList.getUnsupportedFeaturesList) + .build() + lastTaskListBytes.set(filteredTaskList.toByteArray) + + // Use the typed DeltaScan proto variant. Core's planner dispatches via the + // OpStruct::DeltaScan match arm under `#[cfg(feature = "contrib-delta")]`. + builder.clearChildren() + Some(builder.setDeltaScan(deltaScanBuilder.build()).build()) + } + + /** + * Serialize the scan's supported data filters into a single predicate proto for kernel's + * stats-based file pruning during log replay. + * + * All supported filters are combined into one AND conjunction. `BoundReference`s carry the + * column INDEX into `scan.output`; the native side resolves indices to column names via the + * `columnNames` array passed alongside. Returns an empty array when no filter serializes. + */ + private def serializeSupportedDataFilters(scan: CometDeltaScanMarker): Array[Byte] = { + val protoFilters = new ListBuffer[Expr]() + // Kernel's stats-based file pruning evaluates the predicate against DATA-column statistics during + // log replay, so it can only reference real data columns. Exclude any filter that touches: + // - a SYNTHETIC column (`__delta_internal_is_row_deleted`, `_tmp_metadata_row_index`, + // `_metadata.*`, ...): not a table column at all -- kernel errors "Predicate references + // unknown column". These are Spark-level filters applied ABOVE the scan, never file-pruning + // predicates. + // - a PARTITION column: partition pruning is done separately in `prunePartitions`; pushing a + // partition predicate into the data-stats predicate also hits kernel's stricter type checks + // (e.g. a generated partition column compared against a literal -> "Timestamp < Int64"). + // Dropping an unpushable filter only forgoes data skipping for it (Spark still applies it); it + // never affects correctness. + val partitionNamesLc = + scan.relation.partitionSchema.fields.map(_.name.toLowerCase(Locale.ROOT)).toSet + def kernelPushable(filter: org.apache.spark.sql.catalyst.expressions.Expression): Boolean = + filter.references.forall { a => + val lc = a.name.toLowerCase(Locale.ROOT) + !SyntheticReadFieldNames.contains(lc) && !partitionNamesLc.contains(lc) + } + scan.supportedDataFilters.filter(kernelPushable).foreach { filter => + exprToProto(filter, scan.output) match { + case Some(proto) => protoFilters += proto + case _ => + } + } + if (protoFilters.isEmpty) { + Array.emptyByteArray + } else if (protoFilters.size == 1) { + protoFilters.head.toByteArray + } else { + // Combine filters into a balanced AND tree (depth O(log N) instead of + // O(N)). A linear left-deep fold overflows protobuf's default 100-level + // recursion limit for plans with many ANDed conditions (Delta data + // skipping predicates routinely build deep stats expressions: e.g. + // DataSkippingDeltaTests "remove redundant stats column references"). + // Both the JVM serde (CometNativeColumnarToRowExec re-parses the plan + // for explain output) and the Rust prost decoder are subject to that + // limit, so balancing the tree fixes both sides. + def balancedAnd(slice: IndexedSeq[Expr]): Expr = { + if (slice.size == 1) { + slice.head + } else { + val mid = slice.size / 2 + val left = balancedAnd(slice.slice(0, mid)) + val right = balancedAnd(slice.slice(mid, slice.size)) + val and = ExprOuterClass.BinaryExpr + .newBuilder() + .setLeft(left) + .setRight(right) + .build() + Expr.newBuilder().setAnd(and).build() + } + } + balancedAnd(protoFilters.toIndexedSeq).toByteArray + } + } + + /** + * Resolve the cloud storage options handed to kernel's `DefaultEngine` and the native parquet + * reader. Kernel picks up `aws_*` / `azure_*` keys; anything else is ignored on the native side + * (for now). + * + * We key off the table root URI rather than `inputFiles.head` because data file names can + * contain characters that aren't URI-safe when Spark's test harness injects prefixes like + * `test%file%prefix-` (breaks `java.net.URI.create`). The table root string comes straight from + * `HadoopFsRelation.location.rootPaths.head.toUri` inside `DeltaReflection.extractTableRoot`, so + * it's already properly encoded. Storage options are bucket-level anyway -- any file under the + * same root resolves to the same config. + * + * For s3/s3a tables we resolve Hadoop's credential provider chain here so log replay + * authenticates under SimpleAWSCredentialsProvider / TemporaryAWSCredentialsProvider / + * AssumedRoleCredentialProvider / IAMInstanceCredentialsProvider just like the data path does. + * The contrib's native engine (delta-kernel-rs's DefaultEngine backed by object_store_kernel) + * doesn't run core's `build_credential_provider`, so we feed it resolved static keys instead. + * SNAPSHOT resolution: log replay completes in seconds, well within any reasonable credential + * TTL. + */ + private def resolveStorageOptions( + scan: CometDeltaScanMarker, + tableRoot: String): java.util.Map[String, String] = { + val relation = scan.relation + val hadoopConf = + relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options) + val tableRootUri = java.net.URI.create(tableRoot) + val baseOptions: Map[String, String] = + NativeConfig.extractObjectStoreOptions(hadoopConf, tableRootUri) + CometDeltaNativeScan + .augmentWithResolvedAwsCredentials(baseOptions, tableRootUri, hadoopConf) + .asJava + } + + /** + * Compute the `final_output_indices` reorder map: for each `required_schema` position, the + * index into the wrapped exec's output (parquet output columns followed by appended synthetics + * in canonical emit order). The native dispatcher applies a final ProjectionExec to reorder + * columns to match Spark's expected layout. Returns `Seq.empty` when no reorder is needed -- + * either no synthetics are emitted, or they already form a correctly-ordered contiguous suffix + * of `required_schema`. + */ + private def computeFinalOutputIndices( + needsSyntheticEmit: Boolean, + requiredSchemaForProto: Array[StructField], + isSynthetic: StructField => Boolean, + emitRowIndex: Boolean, + emitIsRowDeleted: Boolean, + emitRowId: Boolean, + emitRowCommitVersion: Boolean, + rowIndexColumnAlias: String, + metadataColumnNamesEmitted: Seq[String]): Seq[Int] = if (!needsSyntheticEmit) Seq.empty + else { + val firstSyntheticIdx = requiredSchemaForProto.indexWhere(isSynthetic) + val syntheticContiguousSuffix = firstSyntheticIdx >= 0 && + requiredSchemaForProto.drop(firstSyntheticIdx).forall(isSynthetic) + // Synthetic suffix is necessary but NOT sufficient: the order of synthetics + // within the suffix must also match the canonical emission order + // (row_index, is_row_deleted, row_id, row_commit_version, then metadata names + // in `metadataColumnNamesEmitted` order). When the upstream Filter / Project + // binds attributes by ordinal (Delta's PreprocessTableWithDVs adds + // `Filter(__delta_internal_is_row_deleted = 0)` directly above the scan), + // an order mismatch silently misreads one synthetic as another. Force a + // reorder Projection in that case. + val canonicalSyntheticEmitOrder: Seq[String] = (Seq( + (emitRowIndex, + (if (rowIndexColumnAlias.nonEmpty) rowIndexColumnAlias + else DeltaReflection.RowIndexColumnName).toLowerCase(Locale.ROOT)), + (emitIsRowDeleted, + DeltaReflection.IsRowDeletedColumnName.toLowerCase(Locale.ROOT)), + (emitRowId, DeltaReflection.RowIdColumnName.toLowerCase(Locale.ROOT)), + (emitRowCommitVersion, + DeltaReflection.RowCommitVersionColumnName.toLowerCase(Locale.ROOT))).collect { + case (true, name) => name + }) ++ metadataColumnNamesEmitted + val suffixSyntheticNames = requiredSchemaForProto + .drop(firstSyntheticIdx) + .map(_.name.toLowerCase(Locale.ROOT)) + .toSeq + val syntheticSuffixOrderMatches = + syntheticContiguousSuffix && suffixSyntheticNames == canonicalSyntheticEmitOrder + if (syntheticSuffixOrderMatches) Seq.empty + else { + // Native synthetic emit order in build_output_schema (synthetic_columns.rs): + // row_index, is_row_deleted, row_id, row_commit_version, then any + // metadata_column_names in the order they were added. Use the row_index + // ALIAS name when set (e.g. `_tmp_metadata_row_index`) so emit-order + // lookup matches what's in required_schema. + val rowIndexEmittedName = + if (rowIndexColumnAlias.nonEmpty) rowIndexColumnAlias + else DeltaReflection.RowIndexColumnName + val syntheticEmitOrder: Seq[String] = (Seq( + (emitRowIndex, rowIndexEmittedName), + (emitIsRowDeleted, DeltaReflection.IsRowDeletedColumnName), + (emitRowId, DeltaReflection.RowIdColumnName), + (emitRowCommitVersion, DeltaReflection.RowCommitVersionColumnName)).collect { + case (true, name) => name.toLowerCase(Locale.ROOT) + }) ++ metadataColumnNamesEmitted + val nonSyntheticFields = requiredSchemaForProto.filterNot(isSynthetic) + val nonSyntheticIdxByName: Map[String, Int] = + nonSyntheticFields.zipWithIndex.map { case (f, i) => + f.name.toLowerCase(Locale.ROOT) -> i + }.toMap + val syntheticTailStart = nonSyntheticFields.length + requiredSchemaForProto.map { f => + val name = f.name.toLowerCase(Locale.ROOT) + if (isSynthetic(f)) { + val emitIdx = syntheticEmitOrder.indexOf(name) + // emit flags are derived from the same scan.requiredSchema field names + // (lines above), so any synthetic field here must have its corresponding + // emit flag on -- a mismatch would indicate a user column collided with a + // reserved synthetic name AND we missed it. + assert( + emitIdx >= 0, + s"synthetic column '$name' in required_schema but no emit flag is set " + + s"(emit order: $syntheticEmitOrder)") + syntheticTailStart + emitIdx + } else { + nonSyntheticIdxByName(name) + } + }.toSeq + } + } + + + /** + * Compute Spark's `maxSplitBytes` for a Delta scan. Mirrors + * `org.apache.spark.sql.execution.datasources.FilePartition.maxSplitBytes` verbatim so a + * Delta-native scan splits files the same way a vanilla `FileSourceScanExec` would. Inputs are + * file sizes (bytes); other knobs come from session conf and the relation's spark session. + */ + private def maxSplitBytes(scan: CometDeltaScanMarker, fileSizes: Seq[Long]): Long = { + val sparkSession = scan.relation.sparkSession + val conf = sparkSession.sessionState.conf + val openCostInBytes = conf.filesOpenCostInBytes + val maxPartitionBytes = conf.filesMaxPartitionBytes + val minPartitionNum = conf.filesMinPartitionNum + .getOrElse(sparkSession.sparkContext.defaultParallelism) + val totalBytes = fileSizes.map(_ + openCostInBytes).sum + val bytesPerCore = totalBytes / math.max(1, minPartitionNum) + math.min(maxPartitionBytes, math.max(openCostInBytes, bytesPerCore)) + } + + /** + * Expand `tasks` so any task whose file is larger than `maxSplitBytes` is replaced by a + * sequence of byte-range chunks. Each chunk inherits the task's metadata (partition values, DV + * row indexes, row-tracking ids) but carries `byte_range_start` / `byte_range_end` so the + * native parquet reader only materialises row groups whose start offset falls in this range. + * + * Tasks that fit in one chunk are emitted unchanged (no range fields), which preserves the + * original whole-file semantics on the native side. + * + * Note on DV semantics: deletion-vector indexes on the proto are absolute row positions within + * the file. They are copied to every chunk; the native scan filters out rows whose absolute + * index is in the DV regardless of which chunk produced them, so duplicating the index list + * across chunks is correct (just slightly wasteful). + */ + private def splitTasks( + scan: CometDeltaScanMarker, + tasks: Seq[OperatorOuterClass.DeltaScanTask]): Seq[OperatorOuterClass.DeltaScanTask] = { + if (tasks.isEmpty) return tasks + // When the scan needs one task per partition (per-file `_metadata.file_path`), keep each + // task 1:1 with a file: byte-range chunking would create multiple tasks for one file which, + // combined with packTasks below, could end up with multiple FILES per partition and drop + // the 2nd+ files' rows. + if (scanNeedsOneTaskPerPartition(scan)) return tasks + val sizes = tasks.map(_.getFileSize) + val msb = maxSplitBytes(scan, sizes) + if (msb <= 0) return tasks + tasks.flatMap { task => + val size = task.getFileSize + if (size <= msb) Seq(task) + else { + val chunks = scala.collection.mutable.ArrayBuffer[OperatorOuterClass.DeltaScanTask]() + var offset = 0L + while (offset < size) { + val end = math.min(offset + msb, size) + chunks += task.toBuilder + .setByteRangeStart(offset) + .setByteRangeEnd(end) + .build() + offset = end + } + chunks.toSeq + } + } + } + + private def prunePartitions( + tasks: Seq[OperatorOuterClass.DeltaScanTask], + scan: CometDeltaScanMarker, + partitionSchema: StructType): Seq[OperatorOuterClass.DeltaScanTask] = { + if (scan.partitionFilters.isEmpty || partitionSchema.isEmpty) return tasks + + // Phase 5b: filter out DPP expressions (DynamicPruningExpression wrapping + // InSubqueryExec) because they aren't resolved at planning time. Spark + // applies them post-scan at runtime. Static partition filters are still + // evaluated here for file-level pruning. + val staticFilters = scan.partitionFilters.filterNot( + _.exists(_.isInstanceOf[org.apache.spark.sql.catalyst.expressions.PlanExpression[_]])) + if (staticFilters.isEmpty) return tasks + + // Build an `InterpretedPredicate` that expects a row whose schema matches + // `partitionSchema`. Rewrite attribute references to `BoundReference`s keyed by + // partition-schema field index, respecting case sensitivity. + val caseSensitive = scan.conf.getConf[Boolean](SQLConf.CASE_SENSITIVE) + val combined = staticFilters.reduce(And) + val bound = combined.transform { + case a: org.apache.spark.sql.catalyst.expressions.AttributeReference => + val idx = if (caseSensitive) { + partitionSchema.fieldIndex(a.name) + } else { + partitionSchema.fields.indexWhere( + _.name.toLowerCase(Locale.ROOT) == a.name.toLowerCase(Locale.ROOT)) + } + if (idx < 0) return tasks // Can't resolve; skip pruning + BoundReference(idx, partitionSchema(idx).dataType, partitionSchema(idx).nullable) + } + val predicate = InterpretedPredicate(bound) + predicate.initialize(0) + + val sessionZoneId = java.time.ZoneId.of(scan.conf.sessionLocalTimeZone) + tasks.filter { task => + val row = InternalRow.fromSeq(partitionSchema.fields.toSeq.map { field => + val proto = task.getPartitionValuesList.asScala.find(_.getName == field.name) + val strValue = + if (proto.exists(_.hasValue)) Some(proto.get.getValue) else None + DeltaReflection.castPartitionString(strValue, field.dataType, sessionZoneId) + }) + predicate.eval(row) + } + } + + + def createExec(nativeOp: Operator, op: CometDeltaScanMarker): CometNativeExec = { + val tableRoot = DeltaReflection.extractTableRoot(op.relation).getOrElse("unknown") + val tlBytes = + try { + Option(lastTaskListBytes.get()).getOrElse(Array.emptyByteArray) + } finally { + lastTaskListBytes.remove() + } + // Force one file per Spark partition when the scan reads MATERIALISED row-tracking + // columns (`_row-id-col-*` / `_row-commit-version-col-*`). These are real parquet + // columns present only in files rewritten by a row-id-preserving operation + // (OPTIMIZE/UPDATE/MERGE) -- and ABSENT from freshly-appended/inserted files. When a + // single Spark partition packs several such files, `core_glue` emits one parquet + // file-group per file (needed for per-file row_index); reading a column that is + // physically absent from some of those files across the concurrently-executed + // file-groups non-deterministically drops whole file-groups' rows. Pinning one file + // per partition keeps each native plan single-file-group, so the absent-column + // null-fill happens without cross-file-group concurrency. (Same mechanism used for + // per-file `_metadata.file_path`.) See CometDeltaRowTrackingMergeReproSuite. + val readsMaterializedRowTracking = + op.requiredSchema.fields.exists(f => + CometDeltaNativeScan.isMaterializedRowTrackingName(f.name)) + val oneTaskPerPartition = scanNeedsOneTaskPerPartition(op) || readsMaterializedRowTracking || + CometDeltaNativeScan.needsPerFileGroups(op) + + val dppFilters = op.partitionFilters.filter( + _.exists(_.isInstanceOf[org.apache.spark.sql.catalyst.expressions.PlanExpression[_]])) + val partitionSchema = op.relation.partitionSchema + + val exec = CometDeltaNativeScanExec( + nativeOp, + op.output, + org.apache.spark.sql.comet.SerializedPlan(None), + op.wrapped, + tableRoot, + tlBytes, + dppFilters, + partitionSchema, + oneTaskPerPartition = oneTaskPerPartition) + // `op.wrapped` (== exec.originalPlan) is the original, link-bearing scan (preserved through + // DeltaScanRule's rebuild), so CometExecRule's "set up logical links" pass -- which keys off + // originalPlan.logicalLink -- finds it and sets the exec's link, satisfying AQE's + // setLogicalLinkForNewQueryStage assertion. Set it here too for good measure. + op.wrapped.logicalLink.foreach(exec.setLogicalLink) + exec + } +} diff --git a/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/Native.scala b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/Native.scala new file mode 100644 index 0000000000..92fc644a38 --- /dev/null +++ b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/Native.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import org.apache.comet.NativeBase + +/** + * Contrib-local JVM handle to the Delta-specific native entry point. + * + * Extends `NativeBase` so the libcomet load triggers on first use of any subclass -- the contrib + * doesn't reload the library itself (there is exactly one libcomet at runtime), but inheriting + * from `NativeBase` ensures the static initializer ordering works the same way as core's + * `org.apache.comet.Native`. The `@native` method below binds to + * `Java_org_apache_comet_contrib_delta_Native_planDeltaScan` exported by the contrib's Rust crate + * (compiled INTO libcomet via the `contrib-delta` Cargo feature on `native/core`). + */ +class Native extends NativeBase { + + /** + * Driver-side Delta log replay. Returns a prost-encoded `DeltaScanTaskList` proto (raw bytes) + * which the caller decodes via `DeltaScanTaskList.parseFrom(...)`. + * + * @param tableUrl + * absolute URL or bare path of the Delta table root + * @param snapshotVersion + * `-1` for the latest snapshot, otherwise an exact version + * @param storageOptions + * cloud credentials / endpoint overrides (Hadoop-style keys) + * @param predicateBytes + * prost-encoded Catalyst data filter for kernel-side stats-based file pruning, or an empty + * array for no predicate + * @param columnNames + * logical column names the caller requires (kernel uses this for column-mapping resolution + * before stats-based file pruning). + * @param projectedSchemaIpc + * the query's data-read columns in pure-logical names at every nesting level (Spark + * `requiredSchema` minus partition + synthetic columns), serialized as an Arrow IPC schema + * message (`Schema.serializeAsMessage()`). Drives `scan.with_schema(...)` so the returned + * `DeltaScanTaskList` carries kernel's projected `physical_schema` / `logical_schema`. Empty + * array for no projection (full-table scan; no kernel schemas returned). + * @return + * `byte[]` containing the encoded DeltaScanTaskList + */ + @native def planDeltaScan( + tableUrl: String, + snapshotVersion: Long, + storageOptions: java.util.Map[String, String], + predicateBytes: Array[Byte], + columnNames: Array[String], + projectedSchemaJson: String): Array[Byte] + + /** + * Schema-only companion to [[planDeltaScan]] for the batch-file-index read path (file list comes + * from Delta `AddFile`s, but the kernel-read executor still needs kernel's resolved + * physical/logical schemas). Returns a `DeltaScanTaskList` with only `physical_schema` / + * `logical_schema` set (Arrow IPC). `projectedSchemaJson` is the data-read schema as Delta schema + * JSON (`StructType.json`, carrying column-mapping physicalName/id from the analysis-time or + * snapshot schema); empty string => zero data columns, no schemas returned. + */ + @native def planDeltaReadSchemas( + tableUrl: String, + snapshotVersion: Long, + storageOptions: java.util.Map[String, String], + projectedSchemaJson: String): Array[Byte] +} diff --git a/contrib/delta/src/main/scala/org/apache/spark/sql/comet/CometDeltaNativeScanExec.scala b/contrib/delta/src/main/scala/org/apache/spark/sql/comet/CometDeltaNativeScanExec.scala new file mode 100644 index 0000000000..c5dd2a5280 --- /dev/null +++ b/contrib/delta/src/main/scala/org/apache/spark/sql/comet/CometDeltaNativeScanExec.scala @@ -0,0 +1,554 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import java.util.Locale + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} +import org.apache.spark.sql.execution.{FileSourceScanExec, InSubqueryExec, SparkPlan} +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.AccumulatorV2 + +import com.google.common.base.Objects + +import org.apache.comet.serde.OperatorOuterClass +import org.apache.comet.serde.OperatorOuterClass.Operator + +/** + * Native Delta Lake scan operator with split-mode serialization and DPP support. + * + * Common scan metadata (schemas, filters, projections, storage options, column mappings) is + * serialized once at planning time in `nativeOp`. Per-partition file lists are materialized + * lazily in `serializedPartitionData` at execution time so each Spark task receives only its own + * slice of the file list, reducing driver memory. + * + * DPP (Dynamic Partition Pruning) is supported by deferring partition pruning of DPP expressions + * to execution time. Static partition filters are applied at planning time in + * `CometDeltaNativeScan.prunePartitions`; DPP filters are resolved in `serializedPartitionData`. + */ +case class CometDeltaNativeScanExec( + override val nativeOp: Operator, + override val output: Seq[Attribute], + override val serializedPlanOpt: SerializedPlan, + @transient originalPlan: FileSourceScanExec, + tableRoot: String, + @transient taskListBytes: Array[Byte], + @transient dppFilters: Seq[Expression] = Seq.empty, + partitionSchema: StructType = new StructType(), + /** + * When true, `packTasks` emits one group (= one partition) per task so the native plan's + * per-file file-groups stay 1:1 with Spark partitions (Spark consumes a single DataFusion + * partition per Spark partition, so multiple files in one partition would drop the 2nd+ + * files' rows). Set by `CometDeltaNativeScan.createExec` when the scan projects per-file + * `_metadata.file_path`, reads materialized row-tracking columns, or otherwise needs + * per-file groups. + */ + oneTaskPerPartition: Boolean = false) + extends CometLeafExec + with org.apache.spark.sql.comet.CometScanWithPlanData { + + override val supportsColumnar: Boolean = true + + override val nodeName: String = s"CometDeltaNativeScan $tableRoot" + + // DPP support. The AQE DPP subquery on a partitioned Delta scan arrives as an + // unexecutable placeholder: CometExecRule wraps Spark's + // SubqueryAdaptiveBroadcastExec into CometSubqueryAdaptiveBroadcastExec, and + // CometPlanAdaptiveDynamicPruningFilters rewrites it to an executable + // (Comet)SubqueryBroadcastExec with proper broadcast reuse. That rewrite would + // normally produce a copy of this scan, but the copy is dropped when the + // enclosing native block is rebuilt (TreeNode.makeCopy can't carry @transient + // fields, #3510). So the rule installs the rewrite IN PLACE via + // `withDynamicPruningFilters` (below), which updates this transient + // side-channel and returns `this` -- landing the executable subqueries on the + // SAME instance that executes. `dppFilters` (the case-class field) is left + // untouched so node equality/canonicalization is unaffected; everything at + // execution reads `effectiveDppFilters`. + // `@volatile`: set during query-stage optimization and read during execution + // (driver-thread-confined in practice, but volatile guards against AQE re-planning + // on a different thread). + @transient @volatile private var dppFiltersOverride: Seq[Expression] = null + + private def effectiveDppFilters: Seq[Expression] = + if (dppFiltersOverride != null) dppFiltersOverride else dppFilters + + override def dynamicPruningFilters: Seq[Expression] = effectiveDppFilters + + override def withDynamicPruningFilters(filters: Seq[Expression]): SparkPlan = { + dppFiltersOverride = filters + this + } + + /** True when a DPP subquery is an adaptive-broadcast placeholder we can't + * execute: the unwrapped Spark `SubqueryAdaptiveBroadcastExec` or the + * Comet-wrapped `CometSubqueryAdaptiveBroadcastExec`. Both throw from + * `doExecute()`. Normally the rule rewrites them in place (see above) before + * execution; this guard skips any that slip through (e.g. the rule didn't + * run) so we read all partitions instead of crashing. */ + private def isUnexecutableDpp(plan: SparkPlan): Boolean = + plan.isInstanceOf[org.apache.spark.sql.execution.SubqueryAdaptiveBroadcastExec] || + plan.isInstanceOf[CometSubqueryAdaptiveBroadcastExec] + + override protected def doPrepare(): Unit = { + // `prepare()` (not execute) is safe for any subquery plan, including a + // placeholder. + effectiveDppFilters.foreach { + case DynamicPruningExpression(e: InSubqueryExec) => + e.plan.prepare() + case _ => + } + super.doPrepare() + } + + // Resolve only the DPP subqueries we can execute; skip adaptive-broadcast + // PLACEHOLDERS (CometSubqueryAdaptiveBroadcastExec / SubqueryAdaptiveBroadcastExec), + // which throw from doExecute(). When the optimizer rule's in-place rewrite reached + // this instance, `effectiveDppFilters` holds the executable form and pruning applies; + // otherwise the placeholder is skipped and the scan reads all partitions (correct, the + // surrounding Filter/join still prunes). `applyDppFilters` enforces the same skip. + private def resolveExecutableDppSubqueries(): Unit = { + effectiveDppFilters.foreach { + case DynamicPruningExpression(inSub: InSubqueryExec) + if !isUnexecutableDpp(inSub.plan) && inSub.values().isEmpty => + inSub.updateResult() + case _ => + } + } + + // Comet's native-scan subquery lifecycle (see CometLeafExec): used when this scan is + // fused inside a parent native block (findAllPlanData path). + override def ensureSubqueriesResolved(): Unit = { + prepare() + resolveExecutableDppSubqueries() + } + + // Standard Spark lifecycle path (executeColumnar -> executeQuery -> waitForSubqueries), + // used when this scan is a native-block ROOT executed directly (e.g. the child of a + // CometNativeColumnarToRowExec, as in a MERGE target read). The default would execute + // EVERY collected subquery -- including an unconverted CometSubqueryAdaptiveBroadcastExec + // (the in-place DPP rewrite is lost whenever the plan is copied after the rule runs, + // since `dppFiltersOverride` is not a constructor field) -- and crash. Override to + // resolve only the executable ones, mirroring `ensureSubqueriesResolved`. The native + // scan has no subqueries other than its DPP partition filters, so not delegating to + // `super` is safe. + override def waitForSubqueries(): Unit = resolveExecutableDppSubqueries() + + @transient private lazy val commonBytes: Array[Byte] = { + // The typed DeltaScan variant of OpStruct carries the common block directly. + nativeOp.getDeltaScan.getCommon.toByteArray + } + + @transient private lazy val allTasks: Seq[OperatorOuterClass.DeltaScanTask] = + OperatorOuterClass.DeltaScanTaskList + .parseFrom(taskListBytes) + .getTasksList + .asScala + .toSeq + + /** + * Synthesise a `Seq[FilePartition]` from this scan's tasks, with each task becoming one + * `PartitionedFile` carrying its partition values as an `InternalRow`. Delta tests (e.g. + * `DeltaSinkSuite`) inspect `executedPlan.collect[DataSourceScanExec]` and read + * `inputRDDs.head.asInstanceOf[FileScanRDD].filePartitions` to verify partition pruning; those + * tests find nothing under Comet because we replace the scan with this exec. The test diff in + * `dev/diffs/delta/.diff` patches the helper to fall back to this accessor, so the + * same partition-pruning assertions pass against Comet's scan. + */ + def synthesizedFilePartitions: Seq[org.apache.spark.sql.execution.datasources.FilePartition] = { + if (allTasks.isEmpty) return Nil + val sessionTz = java.time.ZoneId.of(SQLConf.get.sessionLocalTimeZone) + val files = allTasks.zipWithIndex.map { case (task, _) => + val pvRow = InternalRow.fromSeq(partitionSchema.fields.toSeq.map { f => + val proto = task.getPartitionValuesList.asScala.find(_.getName == f.name) + val s = if (proto.exists(_.hasValue)) Some(proto.get.getValue) else None + org.apache.comet.contrib.delta.DeltaReflection + .castPartitionString(s, f.dataType, sessionTz) + }) + val sparkPath = + org.apache.spark.paths.SparkPath.fromUrlString(task.getFilePath) + org.apache.spark.sql.execution.datasources.PartitionedFile( + partitionValues = pvRow, + filePath = sparkPath, + start = if (task.hasByteRangeStart) task.getByteRangeStart else 0L, + length = { + if (task.hasByteRangeStart && task.hasByteRangeEnd) { + task.getByteRangeEnd - task.getByteRangeStart + } else task.getFileSize + }, + modificationTime = 0L, + fileSize = task.getFileSize) + } + files.zipWithIndex.map { case (pf, i) => + org.apache.spark.sql.execution.datasources.FilePartition(i, Array(pf)) + } + } + + /** + * Build per-partition bytes from the current DPP-pruned task list. DPP filters that are still + * `SubqueryAdaptiveBroadcastExec` placeholders at planning time materialise lazily once AQE + * runs the broadcast; by recomputing this at `doExecuteColumnar` (rather than memoising the + * result in a lazy val) we pick up the resolved values and actually skip partitions, instead of + * reading the full table every time AQE is in the loop. + */ + private def buildPerPartitionBytes(): Array[Array[Byte]] = { + // Group ALL tasks once (`taskGroups`) so the partition COUNT is fixed + // regardless of DPP -- Spark pins `numPartitions` at planning and the native + // RDD's partition count must not change at execution. DPP pruning then + // happens WITHIN each group: pruned-out tasks are removed, and a group whose + // tasks are all pruned becomes an empty DeltaScan (0 rows) -- but the group + // (= partition slot) remains, keeping the count stable. This lets DPP prune + // even when the scan executes inside a parent native block (MERGE/join), + // where the parent reads `perPartitionData` rather than running the scan's + // own `doExecuteColumnar`. + val groups = taskGroups + if (groups.isEmpty) return Array.empty[Array[Byte]] + // Gate on `effectiveDppFilters` (the rule's in-place rewrite), not the raw + // `dppFilters`, so pruning uses the executable converted form when present. + val survivorPaths: Option[Set[String]] = + if (effectiveDppFilters.nonEmpty && partitionSchema.nonEmpty) { + Some(applyDppFilters(allTasks).map(_.getFilePath).toSet) + } else None + groups.map { group => + val kept = survivorPaths match { + case Some(s) => group.filter(t => s.contains(t.getFilePath)) + case None => group + } + val builder = OperatorOuterClass.DeltaScan.newBuilder() + // Thread the table root through to the executor; required by the executor-side + // DV decoder (kernel `absolute_path` joins `_delta_log/deletion_vectors/...` onto + // this) and harmless to set even when no task in this partition has a DV. + if (tableRoot != null && tableRoot.nonEmpty) builder.setTableRoot(tableRoot) + kept.foreach(builder.addTasks) + builder.build().toByteArray + }.toArray + } + + // When `oneTaskPerPartition` is set (per-file `_metadata.file_path` / materialized + // row-tracking / per-file groups), short-circuit packing so each task gets its own + // partition, keeping the native plan's per-file file-groups 1:1 with Spark partitions. + private def packTasks( + tasks: Seq[OperatorOuterClass.DeltaScanTask]): Seq[Seq[OperatorOuterClass.DeltaScanTask]] = { + if (oneTaskPerPartition) return tasks.map(t => Seq(t)) + val conf = originalPlan.relation.sparkSession.sessionState.conf + val openCostInBytes = conf.filesOpenCostInBytes + val maxPartitionBytes = conf.filesMaxPartitionBytes + val minPartitionNum = conf.filesMinPartitionNum + .getOrElse(originalPlan.relation.sparkSession.sparkContext.defaultParallelism) + def taskSize(t: OperatorOuterClass.DeltaScanTask): Long = { + if (t.hasByteRangeStart && t.hasByteRangeEnd) { + math.max(0L, t.getByteRangeEnd - t.getByteRangeStart) + } else t.getFileSize + } + val totalBytes = tasks.map(t => taskSize(t) + openCostInBytes).sum + val bytesPerCore = totalBytes / math.max(1, minPartitionNum) + val msb = math.min(maxPartitionBytes, math.max(openCostInBytes, bytesPerCore)) + val out = scala.collection.mutable.ArrayBuffer[Seq[OperatorOuterClass.DeltaScanTask]]() + val current = scala.collection.mutable.ArrayBuffer[OperatorOuterClass.DeltaScanTask]() + var currentSize = 0L + tasks.foreach { task => + val size = taskSize(task) + if (currentSize + size > msb && current.nonEmpty) { + out += current.toList + current.clear() + currentSize = 0L + } + current += task + currentSize += size + openCostInBytes + } + if (current.nonEmpty) out += current.toList + out.toSeq + } + + // Stable task grouping = the partition layout. Computed once from ALL tasks so + // the partition count is fixed across planning and execution (DPP prunes + // tasks WITHIN groups, never changing the group count). `numPartitions` reads + // this directly so counting partitions never triggers DPP broadcast + // resolution. + // + // An empty scan (zero tasks -- e.g. a DELETE that matches nothing, or a DV-maintenance + // read pruned to zero files) still gets ONE empty group: `outputPartitioning` floors the + // partition count to `max(1, numPartitions)`, so the per-partition data MUST also have one + // (empty) entry or `NativeExecContext`'s "all per-partition arrays must have length + // numPartitions" check trips when this scan is fused into a parent native block. + // (Repro: CometDeltaDeleteWithDVReproSuite.) + @transient private lazy val taskGroups: Seq[Seq[OperatorOuterClass.DeltaScanTask]] = + if (allTasks.isEmpty) Seq(Seq.empty) else packTasks(allTasks) + + private def applyDppFilters( + tasks: Seq[OperatorOuterClass.DeltaScanTask]): Seq[OperatorOuterClass.DeltaScanTask] = { + // Resolve each DPP subquery to its runtime pruning values, then prune tasks + // by evaluating the partition predicate below. By execution time the rule + // has installed executable (Comet)SubqueryBroadcastExec subqueries in place + // (see `withDynamicPruningFilters`); we resolve them here. If an + // unexecutable placeholder slipped through (rule didn't run), skip pruning + // and read all tasks (correct, just unpruned) rather than crashing. + if (effectiveDppFilters.exists { + case DynamicPruningExpression(inSub: InSubqueryExec) => isUnexecutableDpp(inSub.plan) + case _ => false + }) { + return tasks + } + val resolvedFilters: Seq[Expression] = + try { + effectiveDppFilters.map { + case DynamicPruningExpression(inSub: InSubqueryExec) => + if (inSub.values().isEmpty) inSub.updateResult() + inSub + case DynamicPruningExpression(e) => e + case other => other + } + } catch { + case scala.util.control.NonFatal(_) => return tasks + } + if (resolvedFilters.isEmpty) return tasks + + val caseSensitive = SQLConf.get.getConf[Boolean](SQLConf.CASE_SENSITIVE) + val combined = resolvedFilters.reduce(And) + val bound = combined.transform { case a: AttributeReference => + val idx = partitionSchema.fields.indexWhere(f => + if (caseSensitive) f.name == a.name + else f.name.toLowerCase(Locale.ROOT) == a.name.toLowerCase(Locale.ROOT)) + if (idx < 0) return tasks + BoundReference(idx, partitionSchema(idx).dataType, partitionSchema(idx).nullable) + } + val predicate = InterpretedPredicate(bound) + predicate.initialize(0) + + val sessionZoneId = java.time.ZoneId.of(SQLConf.get.sessionLocalTimeZone) + tasks.filter { task => + val row = InternalRow.fromSeq(partitionSchema.fields.toSeq.map { field => + val proto = task.getPartitionValuesList.asScala.find(_.getName == field.name) + val strValue = + if (proto.exists(_.hasValue)) Some(proto.get.getValue) else None + org.apache.comet.contrib.delta.DeltaReflection + .castPartitionString(strValue, field.dataType, sessionZoneId) + }) + predicate.eval(row) + } + } + + def commonData: Array[Byte] = commonBytes + // Recomputed (not memoised) so that when a parent native block reads this at + // execution -- after AQE has materialised the DPP broadcast -- the returned + // per-partition task lists reflect DPP pruning. The partition COUNT is fixed + // by `taskGroups`; only the tasks within each group are pruned. + def perPartitionData: Array[Array[Byte]] = buildPerPartitionBytes() + + /** + * Unique key for matching this scan's common/per-partition data to its operator in the native + * plan. Must be distinct across multiple Delta scans in the same plan tree -- e.g. a self-join + * reading two snapshot versions of the same table, where `tableRoot` alone is not unique. + * + * Derived identically in `DeltaPlanDataInjector.getKey` from the serialized `DeltaScanCommon` + * proto so the driver-side map and the executor-side lookup agree. + * + * Mirrors the pattern used by `CometNativeScanExec.sourceKey`. + */ + def sourceKey: String = CometDeltaNativeScanExec.computeSourceKey(nativeOp) + + def numPartitions: Int = taskGroups.length + + override lazy val outputPartitioning: Partitioning = + UnknownPartitioning(math.max(1, numPartitions)) + + override lazy val outputOrdering: Seq[SortOrder] = Nil + + private class ImmutableSQLMetric(metricType: String) extends SQLMetric(metricType, 0) { + override def merge(other: AccumulatorV2[Long, Long]): Unit = {} + override def reset(): Unit = {} + } + + override lazy val metrics: Map[String, SQLMetric] = { + val taskList = + if (taskListBytes != null) { + OperatorOuterClass.DeltaScanTaskList.parseFrom(taskListBytes) + } else { + null + } + + // Key these under both the Comet-native-side name (`output_rows`, used by the metric + // collector on the native side) and the Spark streaming ProgressReporter name + // (`numOutputRows`, read by `extractSourceToNumInputRows` to populate + // `q.recentProgress.numInputRows`). Without the `numOutputRows` alias, streaming + // workloads that this scan feeds report 0 input rows per batch even when data flows + // correctly -- DeltaSourceSuiteBase.CheckProgress then fails with + // "Execute: 0 did not equal N Expected batches don't match". + val outputRowsMetric = SQLMetrics.createMetric(sparkContext, "number of output rows") + val baseMetrics = Map( + "output_rows" -> outputRowsMetric, + "numOutputRows" -> outputRowsMetric, + "num_splits" -> SQLMetrics.createMetric(sparkContext, "number of file splits processed")) + + val planningMetrics = if (taskList != null) { + val totalFiles = new ImmutableSQLMetric("sum") + totalFiles.set(taskList.getTasksCount.toLong) + sparkContext.register(totalFiles, "total files") + + val dvFiles = new ImmutableSQLMetric("sum") + dvFiles.set(taskList.getTasksList.asScala.count(_.hasDv).toLong) + sparkContext.register(dvFiles, "files with deletion vectors") + + // `numFiles` alias mirrors Spark's `FileSourceScanExec` metric name so + // tests like DeltaSuite.scala "query with predicates should skip + // partitions" -- which read `metrics.get("numFiles")` to verify + // partition skipping -- find the same value on Comet's scan exec. + Map("total_files" -> totalFiles, "numFiles" -> totalFiles, "dv_files" -> dvFiles) + } else { + Map.empty[String, SQLMetric] + } + + baseMetrics ++ planningMetrics + } + + override def doExecuteColumnar(): RDD[ColumnarBatch] = { + val nativeMetrics = CometMetricNode.fromCometPlan(this) + val serializedPlan = CometExec.serializeNativePlan(nativeOp) + // Recompute DPP pruning at execution time so we pick up broadcast results AQE has now + // materialised (the lazy `planningPerPartitionBytes` was computed before AQE ran). When DPP + // is absent or was already resolved at planning time, the two arrays are identical. + val execPerPartitionBytes = buildPerPartitionBytes() + // Mirror `CometNativeScanExec`'s encryption wiring: when parquet encryption is + // enabled on the table's hadoop conf, broadcast the conf to executors and + // gather every input file path (so the parquet reader can decrypt per file). + val sparkSession = originalPlan.relation.sparkSession + val hadoopConf = sparkSession.sessionState + .newHadoopConfWithOptions(originalPlan.relation.options) + val (broadcastedHadoopConfForEncryption, encryptedFilePaths) = + if (org.apache.comet.parquet.CometParquetUtils.encryptionEnabled(hadoopConf)) { + val broadcastedConf = sparkSession.sparkContext + .broadcast(new org.apache.spark.util.SerializableConfiguration(hadoopConf)) + val paths = execPerPartitionBytes.flatMap { bytes => + OperatorOuterClass.DeltaScan.parseFrom(bytes).getTasksList.asScala.map(_.getFilePath) + }.toSeq + (Some(broadcastedConf), paths) + } else { + (None, Seq.empty[String]) + } + val baseRDD = CometExecRDD( + sparkContext, + inputRDDs = Seq.empty, + commonByKey = Map(sourceKey -> commonData), + perPartitionByKey = Map(sourceKey -> execPerPartitionBytes), + serializedPlan = serializedPlan, + numPartitions = execPerPartitionBytes.length, + numOutputCols = output.length, + nativeMetrics = nativeMetrics, + subqueries = Seq.empty, + broadcastedHadoopConfForEncryption = broadcastedHadoopConfForEncryption, + encryptedFilePaths = encryptedFilePaths) + + baseRDD + } + + override def convertBlock(): CometDeltaNativeScanExec = { + val newSerializedPlan = if (serializedPlanOpt.isEmpty) { + val bytes = CometExec.serializeNativePlan(nativeOp) + SerializedPlan(Some(bytes)) + } else { + serializedPlanOpt + } + // IMPORTANT: forward `oneTaskPerPartition` to the rebuilt exec. The case + // class has `oneTaskPerPartition: Boolean = false` as the last constructor + // param with a default; if we don't pass it explicitly here, every call to + // `convertBlock()` silently downgrades the flag to false, packing multiple + // files into one partition and dropping the 2nd+ files' rows for scans that + // emit per-file `_metadata.file_path` / materialized row-tracking columns. + CometDeltaNativeScanExec( + nativeOp, + output, + newSerializedPlan, + originalPlan, + tableRoot, + taskListBytes, + dppFilters, + partitionSchema, + oneTaskPerPartition) + } + + override protected def doCanonicalize(): CometDeltaNativeScanExec = { + copy( + output = output.map(QueryPlan.normalizeExpressions(_, output)), + serializedPlanOpt = SerializedPlan(None), + originalPlan = null, + taskListBytes = null, + dppFilters = Seq.empty) + } + + override def stringArgs: Iterator[Any] = { + val taskCount = + if (taskListBytes != null) { + OperatorOuterClass.DeltaScanTaskList.parseFrom(taskListBytes).getTasksCount + } else { + 0 + } + val dppStr = if (dppFilters.nonEmpty) { + s", dpp=${dppFilters.mkString("[", ", ", "]")}" + } else { + "" + } + Iterator(output, s"$tableRoot ($taskCount files$dppStr)") + } + + override def equals(obj: Any): Boolean = obj match { + case other: CometDeltaNativeScanExec => + // Include `sourceKey` so two scans of the same table at different snapshot versions + // are NOT considered equal. Without this, Spark's ReuseExchangeAndSubquery rule + // collapses a self-join across versions into a single exchange and reuses v0's + // shuffle output for both sides of the join. + tableRoot == other.tableRoot && + output == other.output && + serializedPlanOpt == other.serializedPlanOpt && + sourceKey == other.sourceKey + case _ => false + } + + override def hashCode(): Int = + Objects.hashCode(tableRoot, output.asJava, serializedPlanOpt, sourceKey) +} + +object CometDeltaNativeScanExec { + + /** + * Compute a stable, per-scan unique key from a `DeltaScan` operator proto. Must be + * deterministic and identical between the driver side (`CometDeltaNativeScanExec.sourceKey`) + * and the injector side (`DeltaPlanDataInjector.getKey`). + * + * Includes `snapshot_version` so that two scans of the same table at different time-travel + * versions produce distinct keys -- otherwise `findAllPlanData` collapses their per-partition + * data into a single map entry and one scan inherits the other's file list. + */ + def computeSourceKey(nativeOp: Operator): String = { + val common = nativeOp.getDeltaScan.getCommon + val components = Seq( + common.getTableRoot, + common.getSnapshotVersion.toString, + common.getRequiredSchemaList.toString) + s"${common.getSource}_${components.mkString("|").hashCode}" + } +} diff --git a/contrib/delta/src/main/scala/org/apache/spark/sql/comet/DeltaPlanDataInjector.scala b/contrib/delta/src/main/scala/org/apache/spark/sql/comet/DeltaPlanDataInjector.scala new file mode 100644 index 0000000000..ad9bad0316 --- /dev/null +++ b/contrib/delta/src/main/scala/org/apache/spark/sql/comet/DeltaPlanDataInjector.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.spark.sql.comet + +import org.apache.comet.serde.OperatorOuterClass +import org.apache.comet.serde.OperatorOuterClass.Operator + +/** + * `PlanDataInjector` for the typed `OpStruct::DeltaScan` operator. + * + * The contrib serialises the Delta scan in two parts to keep the closure sent to every + * task small: + * - At planning time `CometDeltaNativeScan.convert` emits a `DeltaScan` proto with + * the `common` block (schemas, table root, filters, ...) and NO tasks; this lands + * in the `Operator` tree as the typed variant `OpStruct.delta_scan`. + * - Per partition, `CometDeltaNativeScanExec` puts the partition's `DeltaScan` + * (tasks-only) bytes into `perPartitionByKey` under a `sourceKey` derived from + * the common block. + * + * Core's `PlanDataInjector.injectPlanData` discovers this object via the reflective + * `Class.forName("org.apache.spark.sql.comet.DeltaPlanDataInjector")` lookup added to + * `PlanDataInjector.injectors`; default builds get no DeltaPlanDataInjector class on + * the classpath and the injector list is unchanged. + * + * Without this injection the native side decodes a tasks-empty `DeltaScan` -> `EmptyExec` + * (0 rows) for every Delta scan. + */ +object DeltaPlanDataInjector extends PlanDataInjector { + + override val opStructCase: Operator.OpStructCase = Operator.OpStructCase.DELTA_SCAN + + override def canInject(op: Operator): Boolean = { + if (!op.hasDeltaScan) return false + // The common-only proto produced at planning time has zero tasks. After injection + // the operator carries the partition's tasks -- skip those (idempotent canInject). + // + // Note: a CDF read always has zero tasks (it carries a version sub-range, not files), so this + // stays true even after the CDF branch in `inject` runs. That's intentionally NOT idempotent- + // guarded the way the task branch is, and it's safe because `PlanDataInjector.injectPlanData` + // walks each operator exactly once per partition (CometExecRDD.compute -> one inject per op). + op.getDeltaScan.getTasksCount == 0 + } + + override def getKey(op: Operator): Option[String] = + Some(CometDeltaNativeScanExec.computeSourceKey(op)) + + override def inject( + op: Operator, + commonBytes: Array[Byte], + partitionBytes: Array[Byte]): Operator = { + // `partitionBytes` is the serialised `DeltaScan` that packs only this partition's + // tasks (no common block) to avoid duplicating schemas across partitions. Splice + // the partition's tasks into the original common-only envelope. + val partitionScan = OperatorOuterClass.DeltaScan.parseFrom(partitionBytes) + val originalScan = op.getDeltaScan + val mergedScanBuilder = OperatorOuterClass.DeltaScan + .newBuilder(originalScan) + .addAllTasks(partitionScan.getTasksList) + // CDF version-range split: a Change Data Feed read carries no tasks; instead the per-partition + // DeltaScan packs this partition's inclusive cdf sub-range in a minimal common (cdf_read marks + // it). Splice that [start, end] over the shared common's full range so each partition's native + // TableChanges read covers only its slice. Regular (non-CDF) per-partition bytes set no common, + // so this is skipped and only the task list is merged. + if (partitionScan.hasCommon && partitionScan.getCommon.getCdfRead) { + val pc = partitionScan.getCommon + val mergedCommon = originalScan.getCommon.toBuilder + mergedCommon.setCdfStartVersion(pc.getCdfStartVersion) + if (pc.hasCdfEndVersion) mergedCommon.setCdfEndVersion(pc.getCdfEndVersion) + else mergedCommon.clearCdfEndVersion() + mergedScanBuilder.setCommon(mergedCommon.build()) + } + op.toBuilder.setDeltaScan(mergedScanBuilder.build()).build() + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingPhysicalNameReproSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingPhysicalNameReproSuite.scala new file mode 100644 index 0000000000..045c6faec2 --- /dev/null +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingPhysicalNameReproSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +// Deterministic mirror of DeltaColumnMappingSuite "column mapping batch scan should detect +// physical name changes" (id mode). df2 is analyzed before the table is overwritten with new +// physical names/field-ids; reading it afterward (schema-on-read check off) must yield NULLs. +// Native-only fresh collect (no vanilla-first collect, which would cache the pinned snapshot +// and mask the bug). +class CometDeltaColumnMappingPhysicalNameReproSuite extends CometDeltaTestBase { + + test("column mapping batch scan should detect physical name changes [id]") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withSQLConf("spark.databricks.delta.properties.defaults.columnMapping.mode" -> "id") { + withDeltaTable("cm_physical_name") { tablePath => + spark.range(10).toDF("id").write.format("delta").save(tablePath) + val df2 = spark.read.format("delta").load(tablePath) + df2.queryExecution.analyzed + withSQLConf( + "spark.databricks.delta.columnMapping.reuseColumnMetadataDuringOverwrite" -> "false") { + spark.range(10).toDF("id") + .write.format("delta").option("overwriteSchema", "true").mode("overwrite") + .save(tablePath) + } + withSQLConf("spark.databricks.delta.checkLatestSchemaOnRead" -> "false") { + val rows = df2.collect() + val nonNull = rows.count(!_.isNullAt(0)) + assert( + rows.length == 10 && nonNull == 0, + s"stale physical name should read NULL: ${rows.length} rows, $nonNull non-null " + + s"(sample=${rows.take(5).map(r => if (r.isNullAt(0)) "null" else r.getLong(0)).toSeq})") + } + } + } + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingSuite.scala new file mode 100644 index 0000000000..36b4968661 --- /dev/null +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaColumnMappingSuite.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import org.apache.spark.sql.comet.CometDeltaNativeScanExec + +/** + * Column mapping (name + id modes) and deletion-vector coverage. Ported from + * the pre-SPI `delta-kernel-phase-1` branch. + */ +class CometDeltaColumnMappingSuite extends CometDeltaTestBase { + + test("deletion vectors: accelerates DV-in-use tables via native DV filter") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("dv_accel") { tablePath => + val ss = spark + import ss.implicits._ + + (0 until 20) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .option("delta.enableDeletionVectors", "true") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id % 3 = 0") + + // orderBy forces AQE wrapping so Comet's prep rules see the plan. + val df = spark.read.format("delta").load(tablePath).orderBy("id") + val plan = df.queryExecution.executedPlan + val deltaScans = collect(plan) { case s: CometDeltaNativeScanExec => s } + assert( + deltaScans.nonEmpty, + s"expected Comet to accelerate a DV-in-use table:\n$plan") + val nativeRows = df.collect().toSeq.map(normalizeRow) + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = spark.read + .format("delta") + .load(tablePath) + .collect() + .toSeq + .map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"native=$nativeRows\nvanilla=$vanillaRows") + } + assert(nativeRows.size == 13, s"expected 13 rows after DELETE, got ${nativeRows.size}") + + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id >= 18") + val df2 = spark.read.format("delta").load(tablePath) + val rows2 = df2.collect().toSeq.map(normalizeRow) + // Assert against vanilla rather than a hardcoded size: in this Spark 4.1 + + // Delta 4.0 combination a second DELETE on the same parquet file where the + // newly-matched row count is small can end up reading the cached pre-DELETE + // snapshot in the same SparkSession. We mirror vanilla so the test gates on + // "native matches vanilla" rather than on Delta-version-specific transaction + // visibility semantics. + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaPost2 = spark.read.format("delta").load(tablePath) + .collect().toSeq.map(normalizeRow) + assert( + rows2.sortBy(_.mkString("|")) == vanillaPost2.sortBy(_.mkString("|")), + s"after 2nd DELETE: native=$rows2 vanilla=$vanillaPost2") + } + val plan2 = df2.queryExecution.executedPlan + assert( + collect(plan2) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet to still accelerate after second DELETE:\n$plan2") + } + } + + test("column mapping: name mode read after rename") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("col_mapping_name") { tablePath => + val ss = spark + import ss.implicits._ + + (0 until 8) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .write + .format("delta") + .option("delta.columnMapping.mode", "name") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + + spark.sql(s"ALTER TABLE delta.`$tablePath` RENAME COLUMN name TO full_name") + + assertDeltaNativeMatches(tablePath, identity) + assertDeltaNativeMatches(tablePath, _.select("id", "full_name")) + } + } + + test("column mapping: id mode") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("col_mapping_id") { tablePath => + val ss = spark + import ss.implicits._ + + (0 until 6) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .write + .format("delta") + .option("delta.columnMapping.mode", "id") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + assertDeltaNativeMatches(tablePath, _.where("id > 2")) + } + } + + test("column mapping + deletion vectors combined") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("col_map_dv") { tablePath => + val ss = spark + import ss.implicits._ + + (0 until 20) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .option("delta.columnMapping.mode", "name") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .option("delta.enableDeletionVectors", "true") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + + spark.sql(s"ALTER TABLE delta.`$tablePath` RENAME COLUMN name TO full_name") + withSQLConf("spark.databricks.delta.deletionVectors.useMetadataRowIndex" -> "false") { + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id % 4 = 0") + val df = spark.read.format("delta").load(tablePath) + val nativeRows = df.collect().toSeq.map(normalizeRow) + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = spark.read + .format("delta") + .load(tablePath) + .collect() + .toSeq + .map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"col mapping + DV: native=$nativeRows\nvanilla=$vanillaRows") + } + assert(nativeRows.size == 15, s"expected 15 rows after DELETE, got ${nativeRows.size}") + } + } + } + + test("column mapping + schema evolution combined") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("col_map_evolve") { tablePath => + val ss = spark + import ss.implicits._ + + (0 until 10) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .write + .format("delta") + .option("delta.columnMapping.mode", "name") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + + (10 until 15) + .map(i => (i.toLong, s"name_$i", i * 2.0)) + .toDF("id", "name", "score") + .write + .format("delta") + .mode("append") + .option("mergeSchema", "true") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + assertDeltaNativeMatches(tablePath, _.where("score IS NOT NULL")) + } + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaCoverageSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaCoverageSuite.scala new file mode 100644 index 0000000000..085e837b3f --- /dev/null +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaCoverageSuite.scala @@ -0,0 +1,516 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import org.apache.spark.sql.functions._ + +/** + * Coverage matrix for `CometDeltaNativeScanExec`. Each test exercises one query + * pattern (projection, filter, sort, aggregate, join, set-op, window, subquery, + * nested-data access) and asserts via [[CometDeltaTestBase.assertDeltaNativeMatches]] + * that BOTH: + * 1. the executed plan contains `CometDeltaNativeScanExec` (the contrib actually + * engaged -- a hard guard against the "inert bridge" class of regression + * we fixed earlier this branch), AND + * 2. results equal vanilla Spark+Delta (set-equal, order-independent). + * + * Tests are grouped roughly by SQL surface area so adding new coverage stays + * pattern-local. Per-area tests use a single backing Delta table built once at + * the top of the test to keep wall-clock fast. + */ +class CometDeltaCoverageSuite extends CometDeltaTestBase { + + // ---- Projection / SELECT -------------------------------------------------- + + test("projection: SELECT *") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_proj_star") { tablePath => + writeIntStrTable(tablePath, 10) + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("projection: SELECT specific columns prunes data schema") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_proj_cols") { tablePath => + writeIntStrTable(tablePath, 10) + assertDeltaNativeMatches(tablePath, _.select("id")) + assertDeltaNativeMatches(tablePath, _.select("name")) + } + } + + test("projection: arithmetic + casts in SELECT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_proj_arith") { tablePath => + writeIntStrTable(tablePath, 10) + assertDeltaNativeMatches( + tablePath, + _.selectExpr("id", "id * 2 AS doubled", "CAST(id AS INT) AS id_int", "length(name) AS nlen")) + } + } + + test("projection: LIMIT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_proj_limit") { tablePath => + writeIntStrTable(tablePath, 50) + // limit is order-dependent; pair with orderBy and assert on a stable set. + assertDeltaNativeMatches(tablePath, _.orderBy("id").limit(5)) + } + } + + test("projection: DISTINCT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_proj_distinct") { tablePath => + val ss = spark + import ss.implicits._ + Seq((1L, "a"), (1L, "a"), (2L, "b"), (3L, "c"), (3L, "c")) + .toDF("id", "name") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.distinct()) + assertDeltaNativeMatches(tablePath, _.select("id").distinct()) + } + } + + // ---- Filters (WHERE) ------------------------------------------------------ + + test("filter: equality + inequality") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_filt_eq") { tablePath => + writeIntStrTable(tablePath, 20) + assertDeltaNativeMatches(tablePath, _.where("id = 5")) + assertDeltaNativeMatches(tablePath, _.where("id != 5")) + assertDeltaNativeMatches(tablePath, _.where("id > 10")) + assertDeltaNativeMatches(tablePath, _.where("id <= 7")) + } + } + + test("filter: IN / NOT IN") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_filt_in") { tablePath => + writeIntStrTable(tablePath, 20) + assertDeltaNativeMatches(tablePath, _.where("id IN (1, 3, 5, 7)")) + assertDeltaNativeMatches(tablePath, _.where("id NOT IN (0, 10, 19)")) + } + } + + test("filter: IS NULL / IS NOT NULL") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_filt_null") { tablePath => + val ss = spark + import ss.implicits._ + Seq((1L, Option("a")), (2L, None), (3L, Option("c")), (4L, None)) + .toDF("id", "name") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.where("name IS NULL")) + assertDeltaNativeMatches(tablePath, _.where("name IS NOT NULL")) + } + } + + test("filter: BETWEEN, LIKE, AND/OR/NOT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_filt_combo") { tablePath => + writeIntStrTable(tablePath, 20) + assertDeltaNativeMatches(tablePath, _.where("id BETWEEN 3 AND 8")) + assertDeltaNativeMatches(tablePath, _.where("name LIKE 'name_1%'")) + assertDeltaNativeMatches(tablePath, _.where("id > 5 AND id < 15")) + assertDeltaNativeMatches(tablePath, _.where("id < 3 OR id > 17")) + assertDeltaNativeMatches(tablePath, _.where("NOT (id = 10)")) + } + } + + // ---- Sorting -------------------------------------------------------------- + + test("sort: ORDER BY ASC / DESC, single + multi key") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_sort") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20).map(i => (i.toLong, s"g_${i % 3}", i % 5)) + .toDF("id", "grp", "v") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.orderBy("id")) + assertDeltaNativeMatches(tablePath, _.orderBy(desc("id"))) + assertDeltaNativeMatches(tablePath, _.orderBy(asc("grp"), desc("id"))) + } + } + + // ---- Aggregations --------------------------------------------------------- + + test("aggregate: COUNT, SUM, AVG, MIN, MAX") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_agg_basic") { tablePath => + writeIntStrTable(tablePath, 20) + // NOTE: `count(*)` is intentionally NOT covered here -- Delta short-circuits + // it to a `LocalTableScan` using the snapshot's `numRecords` stat, so the + // scan never engages and `assertDeltaNativeMatches` would (correctly) fail. + // `count(id)` and other column-touching aggregates do need to read parquet + // and exercise the scan path. + assertDeltaNativeMatches(tablePath, _.agg(count("id").as("c"))) + assertDeltaNativeMatches(tablePath, _.agg(sum("id"), avg("id"), min("id"), max("id"))) + } + } + + test("aggregate: GROUP BY single + multi column, with HAVING") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_agg_group") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 30).map(i => (i.toLong, s"g_${i % 3}", i % 5)) + .toDF("id", "grp", "v") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.groupBy("grp").agg(count("*").as("c"), sum("id").as("s"))) + assertDeltaNativeMatches(tablePath, _.groupBy("grp", "v").agg(count("*").as("c"))) + assertDeltaNativeMatches( + tablePath, + df => df.groupBy("grp").agg(count("*").as("c")).where("c > 5")) + } + } + + test("aggregate: COUNT DISTINCT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_agg_cd") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 30).map(i => (i.toLong, s"g_${i % 4}")) + .toDF("id", "grp") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.agg(countDistinct("grp").as("dg"))) + } + } + + // ---- Joins ---------------------------------------------------------------- + + test("join: self-join (inner)") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_join_self") { tablePath => + writeIntStrTable(tablePath, 10) + assertDeltaNativeMatches( + tablePath, + df => df.as("a").join(df.as("b"), col("a.id") === col("b.id")).select(col("a.id"))) + } + } + + test("join: inner / left outer / left semi between two delta tables") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_join_lhs") { lhsPath => + withDeltaTable("cov_join_rhs") { rhsPath => + val ss = spark + import ss.implicits._ + (0 until 10).map(i => (i.toLong, s"l_$i")).toDF("id", "l") + .write.format("delta").save(lhsPath) + Seq(1L, 3L, 5L, 7L, 9L, 11L).map(i => (i, s"r_$i")).toDF("id", "r") + .write.format("delta").save(rhsPath) + // For two-table queries we still want to verify BOTH scans are accelerated; + // assertDeltaNativeMatches checks at least one CometDeltaNativeScanExec. + // Run a series of join modes manually. + val l = spark.read.format("delta").load(lhsPath) + val r = spark.read.format("delta").load(rhsPath) + assertJoinAcceleratedAndMatches(lhsPath, rhsPath, "inner") + assertJoinAcceleratedAndMatches(lhsPath, rhsPath, "left") + assertJoinAcceleratedAndMatches(lhsPath, rhsPath, "leftsemi") + assertJoinAcceleratedAndMatches(lhsPath, rhsPath, "leftanti") + // Silence "unused" warning for l/r: + val _ = (l, r) + } + } + } + + private def assertJoinAcceleratedAndMatches( + lhsPath: String, + rhsPath: String, + joinType: String): Unit = { + def buildPlan(): org.apache.spark.sql.DataFrame = { + val l = spark.read.format("delta").load(lhsPath) + val r = spark.read.format("delta").load(rhsPath) + l.join(r, Seq("id"), joinType).orderBy("id") + } + val nativeDf = buildPlan() + val nativeRows = nativeDf.collect().toSeq.map(normalizeRow) + val plan = nativeDf.queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert( + deltaScans.size >= 2, + s"$joinType join: expected >= 2 CometDeltaNativeScanExec, got ${deltaScans.size}\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = buildPlan().collect().toSeq.map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"$joinType join: native != vanilla\nnative=$nativeRows\nvanilla=$vanillaRows") + } + } + + // ---- Set operations ------------------------------------------------------- + + test("setop: UNION / UNION ALL / INTERSECT / EXCEPT") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_setop_a") { aPath => + withDeltaTable("cov_setop_b") { bPath => + val ss = spark + import ss.implicits._ + (1 to 5).map(i => (i.toLong, s"x_$i")).toDF("id", "v") + .write.format("delta").save(aPath) + (4 to 8).map(i => (i.toLong, s"x_$i")).toDF("id", "v") + .write.format("delta").save(bPath) + def both(op: (org.apache.spark.sql.DataFrame, org.apache.spark.sql.DataFrame) + => org.apache.spark.sql.DataFrame): Unit = { + def build(): org.apache.spark.sql.DataFrame = { + val a = spark.read.format("delta").load(aPath) + val b = spark.read.format("delta").load(bPath) + op(a, b).orderBy("id") + } + val nativeRows = build().collect().toSeq.map(normalizeRow) + val plan = build().queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert(deltaScans.nonEmpty, s"expected CometDeltaNativeScanExec in:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = build().collect().toSeq.map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"native=$nativeRows\nvanilla=$vanillaRows") + } + } + both((a, b) => a.union(b)) + both((a, b) => a.unionAll(b)) + both((a, b) => a.intersect(b)) + both((a, b) => a.except(b)) + } + } + } + + // ---- Window functions ----------------------------------------------------- + + test("window: row_number / rank / lag / lead") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_window") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20).map(i => (i.toLong, s"g_${i % 3}", i % 5)) + .toDF("id", "grp", "v") + .write.format("delta").save(tablePath) + val w = org.apache.spark.sql.expressions.Window + .partitionBy("grp") + .orderBy("id") + assertDeltaNativeMatches( + tablePath, + _.withColumn("rn", row_number().over(w)) + .withColumn("rk", rank().over(w)) + .withColumn("lg", lag("id", 1).over(w)) + .withColumn("ld", lead("id", 1).over(w))) + } + } + + // ---- Subqueries ----------------------------------------------------------- + + test("subquery: scalar subquery in WHERE") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_sub_scalar") { tablePath => + writeIntStrTable(tablePath, 20) + spark.read.format("delta").load(tablePath).createOrReplaceTempView("cov_sub_scalar") + val df = spark.sql( + "SELECT * FROM cov_sub_scalar WHERE id > (SELECT AVG(id) FROM cov_sub_scalar)") + val rows = df.collect().toSeq.map(normalizeRow) + val plan = df.queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert(deltaScans.nonEmpty, s"expected CometDeltaNativeScanExec:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + spark.read.format("delta").load(tablePath).createOrReplaceTempView("cov_sub_scalar_v") + val vanillaRows = spark.sql( + "SELECT * FROM cov_sub_scalar_v WHERE id > (SELECT AVG(id) FROM cov_sub_scalar_v)") + .collect().toSeq.map(normalizeRow) + assert( + rows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"native=$rows\nvanilla=$vanillaRows") + } + } + } + + test("subquery: IN subquery") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_sub_in_a") { aPath => + withDeltaTable("cov_sub_in_b") { bPath => + writeIntStrTable(aPath, 20) + val ss = spark + import ss.implicits._ + Seq(3L, 7L, 11L).toDF("k").write.format("delta").save(bPath) + spark.read.format("delta").load(aPath).createOrReplaceTempView("cov_a") + spark.read.format("delta").load(bPath).createOrReplaceTempView("cov_b") + val df = spark.sql("SELECT * FROM cov_a WHERE id IN (SELECT k FROM cov_b)") + val rows = df.collect().toSeq.map(normalizeRow) + val plan = df.queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert(deltaScans.nonEmpty, s"expected CometDeltaNativeScanExec:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = spark.sql("SELECT * FROM cov_a WHERE id IN (SELECT k FROM cov_b)") + .collect().toSeq.map(normalizeRow) + assert( + rows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"native=$rows\nvanilla=$vanillaRows") + } + } + } + } + + // ---- CTEs ----------------------------------------------------------------- + + test("CTE: WITH ... SELECT chain") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_cte") { tablePath => + writeIntStrTable(tablePath, 20) + spark.read.format("delta").load(tablePath).createOrReplaceTempView("cov_cte") + val df = spark.sql( + "WITH odd AS (SELECT * FROM cov_cte WHERE id % 2 = 1) " + + "SELECT count(*) AS c FROM odd") + val rows = df.collect().toSeq.map(normalizeRow) + val plan = df.queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert(deltaScans.nonEmpty, s"expected CometDeltaNativeScanExec:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = spark.sql( + "WITH odd AS (SELECT * FROM cov_cte WHERE id % 2 = 1) " + + "SELECT count(*) AS c FROM odd") + .collect().toSeq.map(normalizeRow) + assert(rows == vanillaRows, s"native=$rows\nvanilla=$vanillaRows") + } + } + } + + // ---- Coverage with partitioned tables ------------------------------------- + + test("partitioned: filter + projection on partition column") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_part") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 30).map(i => (i.toLong, s"v_$i", s"p_${i % 3}")) + .toDF("id", "v", "p") + .write.format("delta").partitionBy("p").save(tablePath) + assertDeltaNativeMatches(tablePath, _.where("p = 'p_1'")) + assertDeltaNativeMatches(tablePath, _.where("p = 'p_1' AND id > 10")) + assertDeltaNativeMatches(tablePath, _.select("p", "id")) + } + } + + // ---- Coverage with column-mapping enabled --------------------------------- + + test("column mapping (name): filter + project + agg") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_cm_name") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20).map(i => (i.toLong, s"name_$i", i * 1.0)) + .toDF("id", "name", "score") + .write + .format("delta") + .option("delta.columnMapping.mode", "name") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + assertDeltaNativeMatches(tablePath, _.where("id > 5").select("id", "name")) + assertDeltaNativeMatches(tablePath, _.agg(sum("score").as("s"))) + } + } + + // ---- Coverage with deletion vectors --------------------------------------- + + test("dv: projection + filter on DV-bearing table") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_dv") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 30) + .map(i => (i.toLong, s"n_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .option("delta.enableDeletionVectors", "true") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .save(tablePath) + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id % 5 = 0") + // `select("id")` and SUM go through assertDeltaNativeMatches (vanilla matches + // native in this configuration). + assertDeltaNativeMatches(tablePath, _.select("id")) + assertDeltaNativeMatches(tablePath, _.agg(sum("id").as("s"), min("id"), max("id"))) + // The `where("id > 10")` variant previously skipped the vanilla comparison + // because Spark's in-session DeltaLog snapshot cache could serve the vanilla + // read a stale pre-DELETE snapshot (rows the DV should hide). The cache is + // process-global and keyed by path, so clearing it forces both reads to + // re-resolve the post-DELETE snapshot, restoring a real correctness comparison. + org.apache.spark.sql.delta.DeltaLog.clearCache() + val df = spark.read.format("delta").load(tablePath) + .where("id > 10").select("id", "name") + val nativeRows = df.collect().toSeq.map(normalizeRow) + val plan = df.queryExecution.executedPlan + val deltaScans = collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + } + assert( + deltaScans.nonEmpty, + s"expected CometDeltaNativeScanExec on DV-bearing filtered read:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + org.apache.spark.sql.delta.DeltaLog.clearCache() + val vanillaRows = spark.read.format("delta").load(tablePath) + .where("id > 10").select("id", "name").collect().toSeq.map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"DV filtered native=$nativeRows vanilla=$vanillaRows") + } + } + } + + // ---- Nested data access --------------------------------------------------- + + test("nested: struct field + array element + map value access") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("cov_nested") { tablePath => + val ss = spark + import ss.implicits._ + Seq( + (1L, ("a", 1), Seq(10, 20, 30), Map("k1" -> 100, "k2" -> 200)), + (2L, ("b", 2), Seq(40, 50), Map("k1" -> 300))) + .toDF("id", "s", "arr", "m") + .write.format("delta").save(tablePath) + assertDeltaNativeMatches(tablePath, _.selectExpr("id", "s._1 AS s1", "s._2 AS s2")) + assertDeltaNativeMatches(tablePath, _.selectExpr("id", "arr[0] AS a0", "size(arr) AS asz")) + assertDeltaNativeMatches(tablePath, _.selectExpr("id", "m['k1'] AS mk1")) + } + } + + // ---- helpers -------------------------------------------------------------- + + private def writeIntStrTable(tablePath: String, n: Int): Unit = { + val ss = spark + import ss.implicits._ + (0 until n).map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .write.format("delta").save(tablePath) + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaFeaturesSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaFeaturesSuite.scala new file mode 100644 index 0000000000..81f0c9ceeb --- /dev/null +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaFeaturesSuite.scala @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import org.apache.spark.sql.comet.CometDeltaNativeScanExec +import org.apache.spark.sql.functions._ + +/** + * Coverage for the special features the contrib supports beyond plain reads. + * Each test asserts BOTH that Comet's native plan engages AND that results match + * vanilla Spark, so future silent-disengagement bugs are caught. + * + * Mapped to the design-doc feature list: + * - Deletion Vectors (native DeltaDvFilterExec path) + * - Row tracking (synthesised + materialised cases) + * - Synthetic columns (__delta_internal_row_index) + * - input_file_name() and FileBlockHolder threading + * - Complex types (struct, array, map) + * - Joins and aggregations over Delta + * - Time travel by timestamp + * - Multi-append / multi-file scenarios + */ +class CometDeltaFeaturesSuite extends CometDeltaTestBase { + + // ---- Deletion Vectors ----------------------------------------------------- + + test("DV: native scan engages on DV-bearing tables after DELETE") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_dv") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .option("delta.enableDeletionVectors", "true") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .save(tablePath) + + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id % 3 = 0") + + val df = spark.read.format("delta").load(tablePath) + val rows = df.collect() + val plan = df.queryExecution.executedPlan + assert( + collect(plan) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet native scan on DV-bearing table:\n$plan") + assert(rows.length === 13, s"expected 13 rows after DELETE, got ${rows.length}") + } + } + + // ---- Row tracking (Phase-1 port) ------------------------------------------ + + test("row tracking: unmaterialised _metadata.row_id synthesised from baseRowId") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_rt_unmat") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 12) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .option("delta.enableRowTracking", "true") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .save(tablePath) + + // orderBy forces a shuffle -> AQE wraps -> Comet's prep rules fire + val df = spark.read + .format("delta") + .load(tablePath) + .selectExpr("id", "_metadata.row_id AS rid") + .orderBy("id") + val rows = df.collect().toSeq + val plan = df.queryExecution.executedPlan + assert( + collect(plan) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet to accelerate rowTracking scan:\n$plan") + + assert(rows.size == 12) + rows.zipWithIndex.foreach { case (row, idx) => + assert(row.getLong(1) == idx.toLong, s"row $idx: rid mismatch") + } + } + } + + // ---- Synthetic columns ---------------------------------------------------- + + test("synthetic: native scan engages when row tracking is enabled (provides _metadata.row_index)") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_synth") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 6) + .map(i => (i.toLong, s"n_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .option("delta.enableRowTracking", "true") + .option("delta.minReaderVersion", "3") + .option("delta.minWriterVersion", "7") + .save(tablePath) + + // orderBy forces AQE wrapping so Comet's prep rules see this plan. + val df = spark.read.format("delta").load(tablePath) + .selectExpr("id", "_metadata.row_index AS ri") + .orderBy("id") + val rows = df.collect() + val plan = df.queryExecution.executedPlan + assert(rows.length === 6, s"expected 6 rows, got ${rows.length}") + assert( + collect(plan) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet to engage when _metadata.row_index is consumed:\n$plan") + } + } + + // ---- input_file_name() ---------------------------------------------------- + + test("input_file_name(): rows return the path of their source parquet file") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_ifn") { tablePath => + val ss = spark + import ss.implicits._ + // Two writes -> two files; each row's input_file_name should be one of them. + (0 until 5).map(i => (i.toLong, "a")) + .toDF("id", "src").repartition(1).write.format("delta").save(tablePath) + (5 until 10).map(i => (i.toLong, "b")) + .toDF("id", "src").repartition(1).write.format("delta").mode("append").save(tablePath) + + // orderBy forces AQE wrapping for Comet's rules to fire. + val df = spark.read.format("delta").load(tablePath) + .withColumn("ifn", input_file_name()) + .orderBy("id") + val rows = df.collect() + assert(rows.length === 10) + val distinctPaths = rows.map(_.getString(2)).toSet + assert(distinctPaths.size === 2, s"expected 2 source files, got $distinctPaths") + assert(distinctPaths.forall(_.contains("parquet")), s"non-parquet path: $distinctPaths") + } + } + + // ---- Complex types -------------------------------------------------------- + + test("complex types: struct, array, map round-trip through native scan") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_complex") { tablePath => + val ss = spark + import ss.implicits._ + Seq( + (1L, ("a", 1), Seq(10, 20), Map("k1" -> 100)), + (2L, ("b", 2), Seq(30), Map("k2" -> 200, "k3" -> 300))) + .toDF("id", "s", "arr", "m") + .write.format("delta").save(tablePath) + + // assertDeltaNativeMatches already asserts native plan presence + result parity. + assertDeltaNativeMatches(tablePath, identity) + // Reinforce: simple read explicitly verifies the contrib scan exec is present. + assertNativePlanContains( + spark.read.format("delta").load(tablePath), + "CometDeltaNativeScanExec") + } + } + + // ---- Aggregations + joins over Delta -------------------------------------- + + test("aggregation: count/sum over Delta uses native scan") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_agg") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 100).map(i => (i.toLong, i % 5, (i * 1.5).toDouble)) + .toDF("id", "g", "v") + .write.format("delta").save(tablePath) + + val df = spark.read.format("delta").load(tablePath) + .groupBy("g").agg(count("*").as("c"), sum("v").as("s")) + val plan = df.queryExecution.executedPlan + assert( + collect(plan) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet native scan in aggregation plan:\n$plan") + + val rows = df.collect().sortBy(_.getInt(0)) + assert(rows.length === 5) + rows.foreach(r => assert(r.getLong(1) === 20L)) + } + } + + test("join: self-join over Delta uses native scan twice") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_join") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20).map(i => (i.toLong, s"n_$i")) + .toDF("id", "name") + .write.format("delta").save(tablePath) + + val df = spark.read.format("delta").load(tablePath).alias("a") + .join( + spark.read.format("delta").load(tablePath).alias("b"), + col("a.id") === col("b.id") + 1) + val plan = df.queryExecution.executedPlan + val scans = collect(plan) { case s: CometDeltaNativeScanExec => s } + assert(scans.size >= 1, s"expected at least 1 native Delta scan in join plan:\n$plan") + assert(df.count() === 19) + } + } + + // ---- Time travel by timestamp --------------------------------------------- + + test("time travel by timestamp reads the older snapshot") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("features_tt_ts") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 5).map(i => (i.toLong, s"v0_$i")).toDF("id", "name") + .write.format("delta").save(tablePath) + // Sleep so timestampAsOf can distinguish the two commits. + Thread.sleep(1500) + val midTimestamp = new java.sql.Timestamp(System.currentTimeMillis()) + Thread.sleep(1500) + (5 until 10).map(i => (i.toLong, s"v1_$i")).toDF("id", "name") + .write.format("delta").mode("append").save(tablePath) + + val df = spark.read + .format("delta") + .option("timestampAsOf", midTimestamp.toString) + .load(tablePath) + // Materialise before inspecting the plan so AQE has finalized it. + val nativeRows = df.collect().toSeq.map(normalizeRow) + val plan = df.queryExecution.executedPlan + assert( + collect(plan) { case s: CometDeltaNativeScanExec => s }.nonEmpty, + s"expected Comet native scan in timestamp time-travel plan:\n$plan") + assert(nativeRows.size === 5) + // Compare content against vanilla at the same pinned timestamp. + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = spark.read.format("delta") + .option("timestampAsOf", midTimestamp.toString) + .load(tablePath).collect().toSeq.map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"timestamp time-travel native=$nativeRows vanilla=$vanillaRows") + } + } + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaMarkerSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaMarkerSuite.scala index cab8890d26..70559f3775 100644 --- a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaMarkerSuite.scala +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaMarkerSuite.scala @@ -23,50 +23,47 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, input_file_name} /** - * Coverage for the contrib-delta CLAIM/DECLINE layer (`DeltaScanRule` + `CometDeltaScanMarker`) - * that this unit introduces, independent of the native read path (the serde/exec land later). - * - * On this build there is no `CometDeltaNativeScan` serde, so `CometExecRule`'s `scanHandler` - * lookup returns `None` and a planted `CometDeltaScanMarker` is left in the plan executing as a - * vanilla Delta fallback. That makes the marker's PRESENCE the observable signal that the rule - * claimed the scan, and its absence the signal that the rule declined -- exactly what these tests - * assert. The native-read assertions live with the serde/exec unit. + * Coverage for the contrib-delta CLAIM/DECLINE path: `DeltaScanRule` plants a `CometDeltaScanMarker`, + * which -- now that the serde (`CometDeltaNativeScan`) is present in this unit -- `CometExecRule` + * CONVERTS into a `CometDeltaNativeScanExec` (a real native read). So a CLAIMED scan is observable as + * a `CometDeltaNativeScanExec` in the plan, and a DECLINED scan falls back to vanilla Spark (no native + * scan). (Before the serde landed, a claimed scan left the marker in the plan executing as a vanilla + * fallback; that earlier-unit behaviour is what changed here.) */ class CometDeltaMarkerSuite extends CometDeltaTestBase { - test("DeltaScanRule plants the marker on a plain Delta read (claim path active)") { + test("DeltaScanRule claims a plain Delta read and it engages the native scan") { assume(deltaSparkAvailable, "io.delta.spark not on the test classpath") - withDeltaTable("marker-planted") { tablePath => + withDeltaTable("claim-native") { tablePath => spark.range(0, 100).toDF("id").write.format("delta").save(tablePath) - val df = spark.read.format("delta").load(tablePath) - // Red-green vs the A.2 build: with `DeltaScanRule$` absent (A.2 bridge only) no marker is - // planted; this unit supplies the rule, so the marker appears (then falls back to vanilla). - assertMarkerPlanted(df) + // The rule claims the scan (plants the marker); with the serde present, CometExecRule converts + // the marker to a CometDeltaNativeScanExec -- so the engaged-native check is the claim signal. + assertKernelReadEngaged(tablePath) } } - test("marker is planted on a filtered/projected read and the fallback stays result-correct") { + test("a filtered/projected claimed read goes native and matches vanilla Spark") { assume(deltaSparkAvailable, "io.delta.spark not on the test classpath") - withDeltaTable("marker-fallback-correct") { tablePath => + withDeltaTable("claim-native-filtered") { tablePath => spark.range(0, 100).selectExpr("id", "id * 2 as v").write.format("delta").save(tablePath) - val query = (df: DataFrame) => df.filter("id > 10").select("id", "v") - // Assert the rule actually CLAIMS this query shape (catches a claim-path regression, not just - // a result mismatch -- a disengaged claim path would still match rows since both sides run - // vanilla), AND that the marker's vanilla fallback returns identical rows. - assertMarkerPlanted(query(spark.read.format("delta").load(tablePath))) - assertResultsMatchVanilla(tablePath, query) + // Asserts the read engages `CometDeltaNativeScanExec` AND results match vanilla -- catches a + // claim-path regression (no native scan) and a correctness regression in one shot. + assertDeltaNativeMatches(tablePath, (df: DataFrame) => df.filter("id > 10").select("id", "v")) } } - test("DeltaScanRule declines an input_file_name() projection (no marker, vanilla read)") { + test("DeltaScanRule declines an input_file_name() projection (falls back to vanilla, no native scan)") { assume(deltaSparkAvailable, "io.delta.spark not on the test classpath") withDeltaTable("decline-input-file-name") { tablePath => spark.range(0, 50).toDF("id").write.format("delta").save(tablePath) - // `input_file_name()` forces a fall back to vanilla (per-file provenance the native scan - // can't surface), so the rule declines and plants no marker. - val df = spark.read.format("delta").load(tablePath).select(col("id"), input_file_name()) - assertNoMarker(df) - assert(df.count() == 50L, "declined read must still return all rows via vanilla Spark") + // `input_file_name()` forces a fall back to vanilla (per-file provenance the native scan can't + // surface), so the rule declines, plants no marker, and no CometDeltaNativeScanExec appears. + val query = (df: DataFrame) => df.select(col("id"), input_file_name()) + assertDeltaFallback(tablePath, query) + assertNoMarker(query(spark.read.format("delta").load(tablePath))) + assert( + query(spark.read.format("delta").load(tablePath)).count() == 50L, + "declined read must still return all rows via vanilla Spark") } } } diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaNativeSuite.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaNativeSuite.scala new file mode 100644 index 0000000000..d1e8770911 --- /dev/null +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaNativeSuite.scala @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet.contrib.delta + +import org.apache.spark.sql.functions._ + +/** + * Core read tests for the native Delta Lake scan path. Covers basic reads, + * projections, filters, partitioning, schema evolution, time travel, complex + * types, and primitive type coverage. + * + * Column mapping and deletion vector tests live in + * [[CometDeltaColumnMappingSuite]]. Joins, aggregations, DPP, metrics, and + * other advanced queries belong in a follow-up `CometDeltaAdvancedSuite`. + * + * Ported from the pre-SPI `delta-kernel-phase-1` branch with no semantic + * changes -- this is the same vertical-slice coverage Phase-1 had, exercising + * the current `CometDeltaNativeScanExec` plan-rewrite path via + * [[CometDeltaTestBase#assertDeltaNativeMatches]]. + */ +class CometDeltaNativeSuite extends CometDeltaTestBase { + + test("kernel-read path (Phase 1b): plain table reads correctly and engages DeltaKernelScanExec") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_smoke") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 10) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .save(tablePath) + + // Correctness: the kernel-read result matches vanilla Spark (and stays on the native + // CometDeltaNativeScanExec, i.e. no Spark-side fallback). + assertDeltaNativeMatches(tablePath, identity) + // Routing: kernel-read via DeltaKernelScanExec is the only path, so it engaged. + assertKernelReadEngaged(tablePath) + } + } + + test("kernel-read path (Phase 1c #44): name-mode column-mapped table") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_cm") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 8) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .option("delta.columnMapping.mode", "name") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + // Rename so a logical name diverges from its physical name (the real column-mapping case). + spark.sql(s"ALTER TABLE delta.`$tablePath` RENAME COLUMN name TO full_name") + + // Force name-mode resolution: with parquet field-id read off, the kernel-read path reads by + // physical name and relabels to logical via the identity transform. + withSQLConf("spark.sql.parquet.fieldId.read.enabled" -> "false") { + assertDeltaNativeMatches(tablePath, identity) + assertKernelReadEngaged(tablePath) + // Projection of the renamed column also reads correctly. + assertDeltaNativeMatches(tablePath, _.select("id", "full_name")) + } + } + } + + test("kernel-read path (#47): nested column-mapped table with a nested rename") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_cm_nested") { tablePath => + spark.sql( + s"""CREATE TABLE delta.`$tablePath` ( + | id INT, + | s STRUCT, + | arr ARRAY>) + |USING delta + |TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.minReaderVersion' = '2', + | 'delta.minWriterVersion' = '5')""".stripMargin) + spark.sql( + s"""INSERT INTO delta.`$tablePath` VALUES + |(1, NAMED_STRUCT('a', 10, 'b', 'x'), ARRAY(NAMED_STRUCT('x', 100))), + |(2, NULL, ARRAY()), + |(3, NAMED_STRUCT('a', 30, 'b', 'z'), ARRAY(NAMED_STRUCT('x', 300), NAMED_STRUCT('x', 301))) + |""".stripMargin) + // Rename a NESTED field so its logical name diverges from its physical name -- the case that + // requires the kernel-read path to physicalise + relabel at every nesting level (#47). + spark.sql(s"ALTER TABLE delta.`$tablePath` RENAME COLUMN s.a TO renamed_a") + + withSQLConf("spark.sql.parquet.fieldId.read.enabled" -> "false") { + assertDeltaNativeMatches(tablePath, _.orderBy("id")) + assertKernelReadEngaged(tablePath) + // Project into the renamed nested field (nested pruning + relabel on the kernel path). + assertDeltaNativeMatches(tablePath, _.select("id", "s.renamed_a").orderBy("id")) + assertDeltaNativeMatches(tablePath, _.select("id", "arr").orderBy("id")) + } + } + } + + test("kernel-read path (#48): zero-data-column read (partition-only count)") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_partonly") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 12) + .map(i => (i.toLong, i % 3, s"v$i")) + .toDF("id", "grp", "v") + .write + .format("delta") + .partitionBy("grp") + .save(tablePath) + // Partition-only aggregate: no data column is read, so the row count is driven from + // record_count (the parquet footer as fallback) -- exercises the #48 zero-data-column path. + assertDeltaNativeMatches( + tablePath, + _.groupBy("grp").agg(count("*").as("c")).orderBy("grp")) + assertKernelReadEngaged(tablePath) + // A bare row count over the partition column also reads no data columns. + assertDeltaNativeMatches(tablePath, _.select("grp").orderBy("grp")) + } + } + + test("kernel-read path (Phase 1c #44): id-mode column-mapped table") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_cm_id") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 8) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .option("delta.columnMapping.mode", "id") + .option("delta.minReaderVersion", "2") + .option("delta.minWriterVersion", "5") + .save(tablePath) + // Rename so a logical name diverges from its physical name. + spark.sql(s"ALTER TABLE delta.`$tablePath` RENAME COLUMN name TO full_name") + + // id-mode reads through the same rename-then-relabel kernel path; field ids ride along on + // the physical schema as a fallback matcher. + assertDeltaNativeMatches(tablePath, identity) + assertKernelReadEngaged(tablePath) + assertDeltaNativeMatches(tablePath, _.select("id", "full_name")) + } + } + + test("kernel-read path (Phase 1c #45): partitioned table with projections + filters") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_part") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 12) + .map(i => (i.toLong, s"name_$i", i % 3)) + .toDF("id", "name", "part") + .write + .format("delta") + .partitionBy("part") + .save(tablePath) + + // The scan outputs data ++ partition; the kernel exec reproduces that, so projections + // and partition filters all work without special handling. + assertDeltaNativeMatches(tablePath, identity) // SELECT * + assertKernelReadEngaged(tablePath) + assertDeltaNativeMatches(tablePath, _.select("id")) // data-only projection + assertDeltaNativeMatches(tablePath, _.select("id", "part")) // data + partition + assertDeltaNativeMatches(tablePath, _.select("part", "name")) // reordered + assertDeltaNativeMatches(tablePath, _.where("part = 1")) // partition filter + } + } + + test("kernel-read path (Phase 1c #46): _metadata columns + DELETE via deletion vectors") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("kernel_read_synth") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20) + .map(i => (i.toLong, s"v_$i")) + .toDF("id", "v") + .repartition(1) + .write + .format("delta") + .option("delta.enableDeletionVectors", "true") + .save(tablePath) + + // _metadata.* is synthesized in-worker by DeltaKernelScanExec. + assertDeltaNativeMatches(tablePath, _.select($"id", $"_metadata.file_path")) + // DELETE writes a deletion vector; the read applies it in-worker, and the surviving rows + // must match vanilla. + spark.sql(s"DELETE FROM delta.`$tablePath` WHERE id % 4 = 0") + assertDeltaNativeMatches(tablePath, identity) + assertKernelReadEngaged(tablePath) + } + } + + test("read a tiny unpartitioned delta table via the native scan") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("smoke") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 10) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + // Explicit accelerator-coverage assertion: the contrib's scan exec must be + // in the plan. Guards against silent disengagement bugs. + assertNativePlanContains( + spark.read.format("delta").load(tablePath), + "CometDeltaNativeScanExec") + } + } + + test("multi-file delta table") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("multifile") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 30) + .map(i => (i.toLong, s"name_$i")) + .toDF("id", "name") + .repartition(3) + .write + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("projection pushdown reads only selected columns") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("projection") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 10) + .map(i => (i.toLong, s"name_$i", i * 1.5, i % 2 == 0)) + .toDF("id", "name", "score", "active") + .repartition(1) + .write + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, _.select("id", "score")) + } + } + + test("partitioned delta table surfaces partition column values") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("partitioned") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 12) + .map(i => (i.toLong, s"name_$i", if (i < 6) "a" else "b")) + .toDF("id", "name", "category") + .write + .partitionBy("category") + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("filter pushdown returns correct rows") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("filter") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20) + .map(i => (i.toLong, s"name_$i", i * 1.5)) + .toDF("id", "name", "score") + .repartition(2) + .write + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, _.where(col("id") >= 5 && col("id") < 15)) + } + } + + test("predicate variety: eq, lt, gt, is null, in, and/or") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("predicates") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 20) + .map(i => (i.toLong, if (i % 3 == 0) null else s"n_$i", i.toDouble)) + .toDF("id", "name", "score") + .repartition(1) + .write + .format("delta") + .save(tablePath) + + // eq + assertDeltaNativeMatches(tablePath, _.where(col("id") === 5)) + // lt + gt + assertDeltaNativeMatches(tablePath, _.where(col("id") < 7 || col("id") > 15)) + // is null + assertDeltaNativeMatches(tablePath, _.where(col("name").isNull)) + // in + assertDeltaNativeMatches(tablePath, _.where(col("id").isin(1L, 4L, 9L, 16L))) + // mixed + assertDeltaNativeMatches( + tablePath, + _.where((col("id") > 5 && col("id") < 12) || col("name").isNull)) + } + } + + test("empty delta table") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("empty") { tablePath => + val ss = spark + import ss.implicits._ + Seq.empty[(Long, String)] + .toDF("id", "name") + .write + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("multiple appends produce many files, native scan reads them all") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("appends") { tablePath => + val ss = spark + import ss.implicits._ + for (batch <- 0 until 3) { + (0 until 10) + .map(i => ((batch * 10 + i).toLong, s"b${batch}_$i")) + .toDF("id", "name") + .repartition(1) + .write + .format("delta") + .mode("append") + .save(tablePath) + } + + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("multi-column partitioning") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("multicol-part") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 16) + .map { i => + (i.toLong, s"n_$i", if (i < 8) "a" else "b", i % 4) + } + .toDF("id", "name", "p1", "p2") + .write + .partitionBy("p1", "p2") + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + // Filter that prunes one partition column + assertDeltaNativeMatches(tablePath, _.where(col("p1") === "a")) + // Filter that prunes both partition columns + assertDeltaNativeMatches(tablePath, _.where(col("p1") === "b" && col("p2") === 2)) + } + } + + test("typed partition columns: int, long, date") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("typed-partitions") { tablePath => + val ss = spark + import ss.implicits._ + (0 until 6) + .map { i => + ( + i.toLong, + s"n_$i", + i, // int partition + (1000L + i), // long partition + java.sql.Date.valueOf(s"2024-01-${i + 1}") // date partition + ) + } + .toDF("id", "name", "p_int", "p_long", "p_date") + .write + .partitionBy("p_int", "p_long", "p_date") + .format("delta") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + // Partition prune by date + assertDeltaNativeMatches( + tablePath, + _.where(col("p_date") === java.sql.Date.valueOf("2024-01-03"))) + } + } + + test("schema evolution: new column added in later commit") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("schema-evo") { tablePath => + val ss = spark + import ss.implicits._ + + // V0: two columns + (0 until 5) + .map(i => (i.toLong, s"n_$i")) + .toDF("id", "name") + .write + .format("delta") + .save(tablePath) + + // V1: add a column with schema-evolution enabled + ss.sql(s"ALTER TABLE delta.`$tablePath` ADD COLUMNS (extra INT)") + (5 until 10) + .map(i => (i.toLong, s"n_$i", Some(i * 100))) + .toDF("id", "name", "extra") + .write + .format("delta") + .mode("append") + .save(tablePath) + + assertDeltaNativeMatches(tablePath, identity) + } + } + + test("time travel by version reads the older snapshot") { + assume(deltaSparkAvailable, "delta-spark not on the test classpath; skipping") + withDeltaTable("tt-version") { tablePath => + val ss = spark + import ss.implicits._ + + // V0: 3 rows + (0 until 3).map(i => (i.toLong, s"v0_$i")).toDF("id", "name") + .write.format("delta").save(tablePath) + // V1: append 3 more + (3 until 6).map(i => (i.toLong, s"v1_$i")).toDF("id", "name") + .write.format("delta").mode("append").save(tablePath) + + // Read at version 0 -- should only see the original 3 rows. + val v0Native = + ss.read.format("delta").option("versionAsOf", "0").load(tablePath) + // Materialise BEFORE inspecting the plan so AQE's query-stage prep rules + // (incl. Comet's) have fired (see CometDeltaTestBase plan-ordering note). + val nativeRows = v0Native.collect().toSeq.map(normalizeRow) + val plan = v0Native.queryExecution.executedPlan + assert( + collect(plan) { + case s: org.apache.spark.sql.comet.CometDeltaNativeScanExec => s + }.nonEmpty, + s"expected CometDeltaNativeScanExec in time-travel v0 plan:\n$plan") + assert(nativeRows.size === 3) + // Compare CONTENT, not just count, against vanilla at the same pinned version, + // so a scan returning the right count from the wrong version is caught. + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { + val vanillaRows = ss.read.format("delta").option("versionAsOf", "0") + .load(tablePath).collect().toSeq.map(normalizeRow) + assert( + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"time-travel v0 native=$nativeRows vanilla=$vanillaRows") + } + } + } +} diff --git a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaTestBase.scala b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaTestBase.scala index 68c123dd70..a222dcc8a7 100644 --- a/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaTestBase.scala +++ b/contrib/delta/src/test/scala/org/apache/comet/contrib/delta/CometDeltaTestBase.scala @@ -24,6 +24,7 @@ import java.nio.file.Files import org.apache.spark.SparkConf import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.CometTestBase +import org.apache.spark.sql.comet.CometDeltaNativeScanExec import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.comet.CometSparkSessionExtensions @@ -32,12 +33,11 @@ import org.apache.comet.CometSparkSessionExtensions * Base trait for unit-testing the contrib-delta JVM layer. * * Wires up Spark+Delta in local mode with the contrib enabled (Comet + Delta session - * extensions, AQE forced on so Comet's query-stage-prep rules fire) and provides the - * claim/decline assertions this unit needs: `assertMarkerPlanted` / `assertNoMarker` - * (did `DeltaScanRule` claim the scan?) and `assertResultsMatchVanilla` (does the - * marker's fallback / a decline stay result-correct?). The native-read assertions - * (`assertDeltaNativeMatches` etc.) land with the serde/exec unit, since they need - * `CometDeltaNativeScanExec`. + * extensions, AQE forced on so Comet's query-stage-prep rules fire) and provides both the + * native-read assertions (`assertDeltaNativeMatches` / `assertKernelReadEngaged` / + * `assertDeltaFallback` / `assertNativePlanContains` -- a claimed scan engages + * `CometDeltaNativeScanExec`) and the decline assertion (`assertNoMarker` -- a declined scan plants + * no `CometDeltaScanMarker` and runs on vanilla Spark). */ trait CometDeltaTestBase extends CometTestBase with AdaptiveSparkPlanHelper { @@ -139,20 +139,6 @@ trait CometDeltaTestBase extends CometTestBase with AdaptiveSparkPlanHelper { collect(df.queryExecution.executedPlan) { case p if p.getClass.getName == MarkerClass => p } } - /** - * Assert that `DeltaScanRule` CLAIMED the scan: the executed plan contains a `CometDeltaScanMarker`. - * On a build without the serde (this unit), `CometExecRule`'s `scanHandler` lookup returns `None`, - * so the marker is left in the plan and executes as a vanilla Delta fallback -- which is exactly the - * claim signal we assert here. (A build with only the A.2 bridge and no `DeltaScanRule$` would NOT - * plant the marker, so this is red there / green here.) - */ - protected def assertMarkerPlanted(df: DataFrame): Unit = { - val markers = markersIn(df) - assert( - markers.nonEmpty, - s"expected a CometDeltaScanMarker in the plan, got:\n${df.queryExecution.executedPlan}") - } - /** * Assert the rule DECLINED the scan (no `CometDeltaScanMarker` planted) -- the read runs as a * vanilla Spark Delta scan. Used for the decline-path cases (unsupported projection, encryption, @@ -166,28 +152,77 @@ trait CometDeltaTestBase extends CometTestBase with AdaptiveSparkPlanHelper { } /** - * Assert the Delta read at `tablePath` returns the same rows whether the native claim path is on - * or off -- i.e. the marker's vanilla fallback (and any decline) is result-correct. Order-independent. + * Assert that `df`'s executed plan (after a forced `.collect()` so AQE materialises Comet's rules) + * contains an operator whose simple class name matches each name in `expectedExecs`. Uses the + * AQE-aware `collect` (from `AdaptiveSparkPlanHelper`) so it descends into the + * `AdaptiveSparkPlanExec` wrapper that every real exec lives inside under the AQE-forced-on config. */ - protected def assertResultsMatchVanilla( + protected def assertNativePlanContains(df: DataFrame, expectedExecs: String*): Unit = { + df.collect() + val plan = df.queryExecution.executedPlan + val present = collect(plan) { case p => p.getClass.getSimpleName }.toSet + val missing = expectedExecs.filterNot(present.contains) + assert( + missing.isEmpty, + s"expected execs missing from plan: ${missing.mkString(", ")}\n" + + s"present execs: ${present.mkString(", ")}\nfull plan:\n$plan") + } + + /** + * Run `query` against the Delta table at `tablePath` with the native scan engaged, assert the + * executed plan contains a `CometDeltaNativeScanExec` (the read went native), and that the rows + * match vanilla Spark's (order-independent). + */ + protected def assertDeltaNativeMatches( tablePath: String, query: DataFrame => DataFrame): Unit = { - val withClaim = query(spark.read.format("delta").load(tablePath)) - .collect() - .toSeq - .map(normalizeRow) + val native = query(spark.read.format("delta").load(tablePath)) + // Materialise first so AQE runs its query-stage prep rules (Comet's CometScanRule fires lazily + // when AQE materialises a stage); after collect, executedPlan reflects the finalized plan. + val nativeRows = native.collect().toSeq.map(normalizeRow) + val plan = native.queryExecution.executedPlan + val deltaScans = collect(plan) { case s: CometDeltaNativeScanExec => s } + assert(deltaScans.nonEmpty, s"expected CometDeltaNativeScanExec in plan, got:\n$plan") + withSQLConf("spark.comet.scan.deltaNative.enabled" -> "false") { - val vanilla = query(spark.read.format("delta").load(tablePath)) + val vanillaRows = query(spark.read.format("delta").load(tablePath)) .collect() .toSeq .map(normalizeRow) assert( - withClaim.sortBy(_.mkString("|")) == vanilla.sortBy(_.mkString("|")), - s"claim-path result did not match vanilla Spark result\n" + - s"withClaim=$withClaim\nvanilla=$vanilla") + nativeRows.sortBy(_.mkString("|")) == vanillaRows.sortBy(_.mkString("|")), + s"native result did not match vanilla Spark result\nnative=$nativeRows\nvanilla=$vanillaRows") } } + /** + * Like `assertDeltaNativeMatches` but asserts the native plan SHOULD fall back: no + * `CometDeltaNativeScanExec` appears (the read ran on vanilla Spark). + */ + protected def assertDeltaFallback( + tablePath: String, + query: DataFrame => DataFrame): Unit = { + val attempt = query(spark.read.format("delta").load(tablePath)) + attempt.collect() + val plan = attempt.queryExecution.executedPlan + val deltaScans = collect(plan) { case s: CometDeltaNativeScanExec => s } + assert( + deltaScans.isEmpty, + s"expected fallback (no CometDeltaNativeScanExec) but plan was:\n$plan") + } + + /** + * Assert the native kernel-read path engaged: the plan carries a `CometDeltaNativeScanExec` rather + * than falling back to vanilla Spark. + */ + protected def assertKernelReadEngaged(tablePath: String): Unit = { + val df = spark.read.format("delta").load(tablePath) + df.collect() // materialize so AQE / Comet rules finalize the plan + val plan = df.queryExecution.executedPlan + val scans = collect(plan) { case s: CometDeltaNativeScanExec => s } + assert(scans.nonEmpty, s"expected CometDeltaNativeScanExec in plan, got:\n$plan") + } + protected def normalizeRow(row: Row): Seq[Any] = row.toSeq.map(normalizeValue) From 0ee301f73c228c473aed3dd7c146112369e60570 Mon Sep 17 00:00:00 2001 From: Scott Schenkein Date: Mon, 29 Jun 2026 09:26:45 -0400 Subject: [PATCH 2/2] fix(contrib-delta): partition injection-slot ordering + _metadata collision guard [#30 + themeA, folded into A.4b] --- .../contrib/delta/CometDeltaNativeScan.scala | 74 ++++++++++++++++--- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala index db0fde87e9..899b007de7 100644 --- a/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala +++ b/contrib/delta/src/main/scala/org/apache/comet/contrib/delta/CometDeltaNativeScan.scala @@ -89,6 +89,25 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit private[delta] val PerFileMetadataNames: Set[String] = SparkFileMetadataNames ++ PerFileRowTrackingNames + // Spark marks a `_metadata.*` virtual column's StructField with `__file_source_metadata_col` (and + // `__metadata_col`). A REAL data column that merely happens to share one of the SparkFileMetadataNames + // (e.g. a user table with a `file_name` column) carries NEITHER. So strip a `file_*` name from the + // kernel read schema ONLY when the field is actually a Spark file-metadata virtual column -- else a + // genuine `file_name`/`file_path`/... data column would be dropped from the read while the proto's + // required_schema keeps it ("missing kernel data-column schemas"). Mirrors `Attribute.isMetadataCol`. + private[delta] def isSparkFileMetadataField(f: StructField): Boolean = + f.metadata.contains("__file_source_metadata_col") || f.metadata.contains("__metadata_col") + + // A field is a synthetic/virtual read column to STRIP from the kernel projection iff its name is in + // `stripNames` AND -- for the file-metadata names that can collide with real user columns -- it + // actually carries the Spark file-metadata marker. + private[delta] def isStrippableSynthetic(f: StructField, stripNames: Set[String]): Boolean = { + val lc = f.name.toLowerCase(Locale.ROOT) + if (!stripNames.contains(lc)) false + else if (SparkFileMetadataNames.contains(lc)) isSparkFileMetadataField(f) + else true + } + /** * `kind` string for the `ContribOp` envelope this serde produces. The native side's * `comet-contrib-delta` rlib registers `DeltaScanPlanner` under this same kind via @@ -148,6 +167,33 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit } } + /** + * Splice partition columns into a kernel read (logical) schema at the position kernel's per-file + * transform INJECTS them: immediately after the last field that advances kernel's + * `last_physical_field` -- every read field EXCEPT a RowId metadata column, which kernel resolves + * via `GenerateRowId` (coalesce(materialised, baseRowId+row_index)) and deliberately does NOT + * advance past. Kernel's expression evaluator (`evaluate_struct_patch_expression`) labels its + * output columns POSITIONALLY against this shipped logical schema, so the partition columns must + * occupy exactly that emission slot. Appending them last instead lands the Int32 partition literal + * in the row_id slot and the Long row_id in the partition slot -- the #30 column swap (visible as + * row_id == partition on a partitioned row-tracking table). `RowIndex` / `RowCommitVersion` DO + * advance `last_physical_field`, so only RowId metadata columns are special-cased here. + */ + private def spliceKernelPartitions( + dataFields: Array[StructField], + partitionFields: Array[StructField]): Array[StructField] = { + if (partitionFields.isEmpty) { + dataFields + } else { + def isKernelRowId(f: StructField): Boolean = + f.metadata.contains(KernelMetadataSpecKey) && + f.metadata.getString(KernelMetadataSpecKey) == KernelRowIdSpec + val cut = dataFields.lastIndexWhere(f => !isKernelRowId(f)) + 1 + val (before, after) = dataFields.splitAt(cut) + before ++ partitionFields ++ after + } + } + /** * Translate Delta's `delta.columnMapping.id` metadata key to Spark+parquet's standard * `parquet.field.id` key on every StructField at every level of nesting. Required for @@ -223,8 +269,7 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit SyntheticReadFieldNames - DeltaReflection.RowIdColumnName.toLowerCase(Locale.ROOT) - DeltaReflection.RowCommitVersionColumnName.toLowerCase(Locale.ROOT) - val dataFields = requiredSchema.fields.filterNot(f => - stripNames.contains(f.name.toLowerCase(Locale.ROOT))) + val dataFields = requiredSchema.fields.filterNot(f => isStrippableSynthetic(f, stripNames)) if (dataFields.isEmpty) { // Zero data columns (partition-only / synthetic-only reads): no kernel read schema; the // executor drives the row count without a parquet read and the partition columns are filled @@ -241,7 +286,8 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit // so column-mapping physical names / field-ids ride along. The AddFiles route passes an empty // `partitionSchema` (its identity transform can't inject partitions, so partitions stay // Comet-appended there until that route also moves to kernel enumeration). - val projected0 = dataFields.map(pick) ++ partitionSchema.fields.map(pick) + val data0 = dataFields.map(pick) + val parts = partitionSchema.fields.map(pick) // Materialised row-id columns (`_row-id-col-*`, added by OPTIMIZE/UPDATE/MERGE) are matched by // NAME and carry NO column-mapping annotation. Under ACTIVE column mapping kernel's logical // with_schema requires both physicalName AND id on every regular field, so shipping the @@ -256,11 +302,14 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit // no kernel metadata-column support (Error::unsupported), so `_row-commit-version-col-*` is left // as-is. See state_info.rs RowId handling + CometDeltaRowTrackingMaterializedSuite (M3). val columnMappingActive = - projected0.exists(_.metadata.contains(DeltaReflection.PhysicalNameMetadataKey)) - val projected = - if (columnMappingActive) projected0.map(asKernelRowIdMetadataColumnIfMaterialized) - else projected0 - StructType(projected).json + (data0 ++ parts).exists(_.metadata.contains(DeltaReflection.PhysicalNameMetadataKey)) + val data = + if (columnMappingActive) data0.map(asKernelRowIdMetadataColumnIfMaterialized) else data0 + // Splice partitions at kernel's injection slot (after the last non-RowId field), NOT appended + // last -- otherwise the positional output labeling swaps an Int32 partition with the Long + // row_id under active CM (the materialised row-id became a RowId metadata column above). See + // `spliceKernelPartitions` / #30. + StructType(spliceKernelPartitions(data, parts)).json } } @@ -298,8 +347,7 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit n.equalsIgnoreCase(DeltaReflection.RowIdColumnName) || n.toLowerCase(Locale.ROOT).startsWith("_row-id-col-") val kept: Array[StructField] = requiredSchema.fields.flatMap { f => - val lc = f.name.toLowerCase(Locale.ROOT) - if (workerOnly.contains(lc)) { + if (isStrippableSynthetic(f, workerOnly)) { None // worker-side constant; not read from kernel } else if (isRowIndex(f.name)) { Some(asKernelMetadataColumn(f.name, "row_index")) @@ -312,7 +360,11 @@ object CometDeltaNativeScan extends CometOperatorSerde[CometDeltaScanMarker] wit Some(pick(f)) } } - val all = kept ++ partitionSchema.fields.map(pick) + // Splice partitions at kernel's injection slot (after the last non-RowId field), NOT appended + // last: `_metadata.row_id` is shipped as a kernel RowId metadata column (line above), and kernel + // injects the partition literal BEFORE it. Appending partitions last makes the executor's + // positional labeling swap the Int32 partition value with the Long row_id -- the #30 column swap. + val all = spliceKernelPartitions(kept, partitionSchema.fields.map(pick)) if (all.isEmpty) "" else StructType(all).json }