From c38d083f8d11e13a805fa480852da7ffff2de7c6 Mon Sep 17 00:00:00 2001 From: MarioCadenas Date: Thu, 11 Jun 2026 12:12:53 +0200 Subject: [PATCH 1/4] fix(type-generator): support IDENTIFIER() params via describe-time sample values (#383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Type generation runs DESCRIBE QUERY with placeholder defaults instead of bound parameters, so an untyped `:param` collapses to `''`. For dynamic table names like `IDENTIFIER(:target_catalog || '.schema.table')` that yields `IDENTIFIER('' || '.schema.table')`, which Databricks rejects with PARSE_SYNTAX_ERROR and aborts the whole build — with no portable workaround. Add an optional describe-time sample value to `@param` annotations: -- @param target_catalog STRING = main SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation') The value is substituted ONLY during DESCRIBE so typegen resolves a real table and infers result columns; at runtime the analytics plugin still binds the actual parameter, so the query stays portable across environments. - extractParameterDefaults() parses `= value`; string/DATE/TIMESTAMP values are quoted (with `'` doubled), numeric/boolean/already-quoted values pass through - substituteParametersForDescribe() centralizes substitution (sample value wins over the type default) — extracted as a pure, unit-tested function - DESCRIBE failures on IDENTIFIER queries now append a remediation hint pointing at the `= value` annotation - docs: analytics SQL parameters + type-generation guide Tests: 16 new cases (extractParameterDefaults + IDENTIFIER substitution); full appkit suite (2149) passes. Signed-off-by: MarioCadenas --- docs/docs/development/type-generation.md | 11 ++ docs/docs/plugins/analytics.md | 24 ++++ .../src/type-generator/query-registry.ts | 127 ++++++++++++++++-- .../tests/query-registry.test.ts | 100 ++++++++++++++ 4 files changed, 252 insertions(+), 10 deletions(-) diff --git a/docs/docs/development/type-generation.md b/docs/docs/development/type-generation.md index c715791a9..48d239ff9 100644 --- a/docs/docs/development/type-generation.md +++ b/docs/docs/development/type-generation.md @@ -94,6 +94,17 @@ The type generator: 4. Generates TypeScript interfaces for query parameters and results 5. Creates a `QueryRegistry` type for type-safe query execution +### Parameters during `DESCRIBE QUERY` + +Type generation describes each query without binding real parameters, so it +substitutes a placeholder default for every `:param` (e.g. `''` for a string). +That breaks queries whose shape depends on a value — most notably dynamic table +names via `IDENTIFIER(:catalog || '.schema.table')`. Annotate such parameters +with a sample value (`-- @param catalog STRING = main`) so the describe call can +resolve a real table. The sample value is used only at type-generation time; the +runtime query still binds the actual parameter. See +[SQL parameters → Sample values](../plugins/analytics.md#sample-values-for-type-generation). + ## Using generated types Once types are generated, your IDE will provide autocomplete and type checking: diff --git a/docs/docs/plugins/analytics.md b/docs/docs/plugins/analytics.md index 3198d4340..a1837f3b6 100644 --- a/docs/docs/plugins/analytics.md +++ b/docs/docs/plugins/analytics.md @@ -61,6 +61,30 @@ at the call site. - `FLOAT`, `DOUBLE` — bind via `sql.float()` / `sql.double()` - `NUMERIC`, `DECIMAL` — bind via `sql.numeric()` (pass strings for precision) +### Sample values for type generation + +Some queries only have a valid shape once a parameter has a concrete value — most +commonly a dynamic table name built with `IDENTIFIER()`. During type generation +AppKit runs `DESCRIBE QUERY` with placeholder defaults, so an unresolved parameter +collapses to an empty string and produces invalid SQL +(`IDENTIFIER('' || '.schema.table')` → `PARSE_SYNTAX_ERROR`). + +Append `= value` to a `-- @param` annotation to give type generation a sample +value. It is used **only** while describing the query; at runtime the real +parameter is still bound, so the query stays portable across environments: + +```sql +-- @param target_catalog STRING = main +SELECT * +FROM IDENTIFIER(:target_catalog || '.sales.nation') +``` + +Type generation describes `main.sales.nation` to infer the result columns, while +the deployed app binds whatever catalog the caller passes. String, `DATE`, and +`TIMESTAMP` values are quoted automatically (`= main` → `'main'`); numeric and +boolean values are used verbatim (`= 100`, `= true`). Already-quoted values are +left as-is. + ## Server-injected parameters `:workspaceId` is **injected by the server** and **must not** be annotated: diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index ce0cdd0b7..c3801a274 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -393,6 +393,116 @@ export function defaultForType(sqlType: string | undefined): string { } } +/** + * Format a user-supplied sample value as a SQL literal for substitution. + * String-like types are wrapped in single quotes (with `'` doubled) unless the + * value is already a quoted literal; numeric/boolean/binary values pass through + * verbatim so callers can write `= 5`, `= true`, or `= X'00'` directly. + */ +function formatSampleValue(sqlType: string | undefined, raw: string): string { + if (raw.startsWith("'") && raw.endsWith("'") && raw.length >= 2) { + return raw; + } + switch (sqlType?.toUpperCase()) { + case "STRING": + case "DATE": + case "TIMESTAMP": + case "TIMESTAMP_NTZ": + return `'${raw.replace(/'/g, "''")}'`; + default: + return raw; + } +} + +/** + * Parse optional describe-time sample values from `@param` annotations, e.g. + * `-- @param target_catalog STRING = main`. The value is substituted into the + * SQL **only during DESCRIBE QUERY** so type generation can resolve queries + * whose shape depends on a parameter value — most notably dynamic table names + * via `IDENTIFIER(:target_catalog || '.schema.table')`, where the empty-string + * default would otherwise produce malformed SQL. Runtime binding is unaffected: + * the analytics plugin still binds the real parameter at execution time, so the + * query stays portable across environments. + * + * Returns a map of parameter name to the formatted SQL literal to substitute. + */ +export function extractParameterDefaults(sql: string): Record { + const defaults: Record = {}; + // Mirrors extractParameterTypes' type alternation, then requires `= ` + // through end-of-line. Lines without a value are left to extractParameterTypes. + const regex = + /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\s*=\s*(.+?)\s*$/gim; + for (const match of sql.matchAll(regex)) { + const [, paramName, paramType, rawValue] = match; + defaults[paramName] = formatSampleValue(paramType, rawValue); + } + return defaults; +} + +/** + * Replace `:param` placeholders with describe-time literals so `DESCRIBE QUERY` + * can run without bound parameters. Resolution order per parameter: + * 1. An explicit `-- @param name TYPE = value` sample value (wins), which lets + * dynamic table names via `IDENTIFIER(...)` resolve to a real table. + * 2. Otherwise a placeholder default derived from the annotated/inferred type. + * Placeholders inside string literals or comments are left untouched. + */ +export function substituteParametersForDescribe(sql: string): string { + const protectedRanges = getProtectedRanges(sql); + const annotatedTypes = extractParameterTypes(sql); + const inferredTypes = inferParameterTypes(sql, protectedRanges); + const parameterTypes = { ...inferredTypes, ...annotatedTypes }; + const parameterDefaults = extractParameterDefaults(sql); + return sql.replace( + /(? { + if (isInsideProtectedRange(offset, protectedRanges)) { + return original; + } + const sampleValue = parameterDefaults[paramName]; + if (sampleValue !== undefined) { + return sampleValue; + } + return defaultForType(parameterTypes[paramName]); + }, + ); +} + +/** + * Append a remediation hint when a DESCRIBE failure looks like a dynamic + * identifier that couldn't be resolved: the query calls `IDENTIFIER(...)` and + * has at least one parameter without a describe-time sample value. These fail + * because typegen substitutes a placeholder default (e.g. `''`) that yields a + * malformed or non-existent table name. Steering the user to the `= value` + * annotation turns the fatal error into a one-line fix. + */ +function withIdentifierHint( + error: { code?: string; message: string }, + sql: string, +): { code?: string; message: string } { + if (!/\bIDENTIFIER\s*\(/i.test(sql)) { + return error; + } + const protectedRanges = getProtectedRanges(sql); + const params = extractParameters(sql, protectedRanges); + const defaults = extractParameterDefaults(sql); + const unresolved = params.filter( + (p) => !SERVER_INJECTED_PARAMS.includes(p) && defaults[p] === undefined, + ); + if (unresolved.length === 0) { + return error; + } + const example = unresolved[0]; + return { + ...error, + message: `${error.message}\n Hint: this query uses IDENTIFIER() with parameter(s) ${unresolved + .map((p) => `:${p}`) + .join( + ", ", + )}. Give type generation a sample value so it can resolve the table, e.g. \`-- @param ${example} STRING = my_catalog\`. The runtime query still binds the real parameter.`, + }; +} + /** * Infer parameter types from positional context in SQL. * V1 only infers NUMERIC from patterns like LIMIT, OFFSET, TOP, @@ -512,20 +622,17 @@ export async function generateQueriesFromDescribe( const annotatedTypes = extractParameterTypes(sql); const inferredTypes = inferParameterTypes(sql, protectedRanges); const parameterTypes = { ...inferredTypes, ...annotatedTypes }; - const sqlWithDefaults = sql.replace( - /(? { - if (isInsideProtectedRange(offset, protectedRanges)) { - return original; - } - return defaultForType(parameterTypes[paramName]); - }, - ); + // Explicit describe-time sample values (`-- @param name TYPE = value`) + // take precedence over the type-based default so queries with dynamic + // table names (IDENTIFIER) can resolve a real table during DESCRIBE. + const parameterDefaults = extractParameterDefaults(sql); + const sqlWithDefaults = substituteParametersForDescribe(sql); // Warn about unresolved parameters const allParams = extractParameters(sql, protectedRanges); for (const param of allParams) { if (SERVER_INJECTED_PARAMS.includes(param)) continue; + if (parameterDefaults[param]) continue; if (parameterTypes[param]) continue; logger.warn( '%s: parameter ":%s" has no type annotation or inference. Add %s to the query file.', @@ -711,7 +818,7 @@ export async function generateQueriesFromDescribe( status: "syntax", index, schema: { name: queryName, type }, - error: parseError(sqlError), + error: withIdentifierHint(parseError(sqlError), sql), }; } diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index b149d5bbe..e961c1f57 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -2,12 +2,14 @@ import { describe, expect, test } from "vitest"; import { convertToQueryType, defaultForType, + extractParameterDefaults, extractParameters, extractParameterTypes, getProtectedRanges, inferParameterTypes, normalizeTypeName, SERVER_INJECTED_PARAMS, + substituteParametersForDescribe, } from "../query-registry"; import type { DatabricksStatementExecutionResponse } from "../types"; @@ -590,3 +592,101 @@ describe("substitution skips protected ranges", () => { expect(result).toContain("id = ''"); }); }); + +describe("extractParameterDefaults", () => { + test("returns empty object when no sample values are present", () => { + const sql = + "-- @param startDate DATE\nSELECT * FROM t WHERE d = :startDate"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("quotes string-like sample values", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "-- @param since DATE = 2024-01-01", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + target_catalog: "'main'", + since: "'2024-01-01'", + }); + }); + + test("passes numeric and boolean sample values through verbatim", () => { + const sql = [ + "-- @param limit INT = 100", + "-- @param ratio DOUBLE = 0.5", + "-- @param active BOOLEAN = true", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + limit: "100", + ratio: "0.5", + active: "true", + }); + }); + + test("leaves already-quoted values untouched and escapes embedded quotes", () => { + const sql = [ + "-- @param a STRING = 'pre-quoted'", + "-- @param b STRING = O'Brien", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + a: "'pre-quoted'", + b: "'O''Brien'", + }); + }); + + test("ignores @param lines without a value", () => { + const sql = "-- @param onlyType STRING\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); +}); + +describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { + test("empty-string default produces malformed SQL for IDENTIFIER params", () => { + // Reproduces the bug: with no sample value, an untyped IDENTIFIER param + // collapses to '' and yields IDENTIFIER('' || '.schema.table'), a leading-dot + // identifier that Databricks rejects with PARSE_SYNTAX_ERROR. + const sql = "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')"; + expect(substituteParametersForDescribe(sql)).toBe( + "SELECT * FROM IDENTIFIER('' || '.sales.nation')", + ); + }); + + test("sample value resolves IDENTIFIER to a real, describable table", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + expect(out).toContain("IDENTIFIER('main' || '.sales.nation')"); + expect(out).not.toContain(":target_catalog"); + }); + + test("sample value wins over the type-based default", () => { + const sql = [ + "-- @param status STRING = active", + "SELECT * FROM t WHERE status = :status", + ].join("\n"); + expect(substituteParametersForDescribe(sql)).toContain("status = 'active'"); + }); + + test("falls back to the type default when no sample value is given", () => { + const sql = ["-- @param limit INT", "SELECT * FROM t LIMIT :limit"].join( + "\n", + ); + expect(substituteParametersForDescribe(sql)).toBe( + "-- @param limit INT\nSELECT * FROM t LIMIT 0", + ); + }); + + test("does not substitute placeholders inside string literals", () => { + const sql = "SELECT ':target_catalog' AS lit, :target_catalog AS p"; + const sql2 = ["-- @param target_catalog STRING = main", sql].join("\n"); + const out = substituteParametersForDescribe(sql2); + expect(out).toContain("':target_catalog' AS lit"); + expect(out).toContain("'main' AS p"); + }); +}); From 9fe37b704264ba8e82ad5093ecaa689d032ea9af Mon Sep 17 00:00:00 2001 From: MarioCadenas Date: Thu, 11 Jun 2026 14:59:00 +0200 Subject: [PATCH 2/4] fix(type-generator): validate describe-time sample values to prevent SQL injection (#383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the IDENTIFIER sample-value support. `formatSampleValue` previously passed numeric/boolean/binary values through verbatim and returned already-quoted strings unescaped, so a `.sql` annotation (which may ship via a shared template or dependency) could inject arbitrary SQL into the build-time `DESCRIBE QUERY` — e.g. `-- @param n INT = 0) UNION SELECT secret FROM creds --`. Validate per type and reject anything that doesn't fit: - string/DATE/TIMESTAMP values are always emitted as one well-formed, fully escaped single-quoted literal (a pre-quoted literal is kept; anything else is escaped), so the value can't break out of the string - numeric types must match a plain-number shape, BOOLEAN must be true/false, BINARY must match X'..' — otherwise formatSampleValue returns null - extractParameterDefaults drops rejected values, so the parameter safely falls back to its type-based placeholder during DESCRIBE Adds tests for numeric/boolean/binary rejection and string neutralization, and documents the validation rules. Signed-off-by: MarioCadenas --- docs/docs/plugins/analytics.md | 10 ++- .../src/type-generator/query-registry.ts | 63 ++++++++++++++++--- .../tests/query-registry.test.ts | 51 +++++++++++++++ 3 files changed, 111 insertions(+), 13 deletions(-) diff --git a/docs/docs/plugins/analytics.md b/docs/docs/plugins/analytics.md index a1837f3b6..1ed102107 100644 --- a/docs/docs/plugins/analytics.md +++ b/docs/docs/plugins/analytics.md @@ -81,9 +81,13 @@ FROM IDENTIFIER(:target_catalog || '.sales.nation') Type generation describes `main.sales.nation` to infer the result columns, while the deployed app binds whatever catalog the caller passes. String, `DATE`, and -`TIMESTAMP` values are quoted automatically (`= main` → `'main'`); numeric and -boolean values are used verbatim (`= 100`, `= true`). Already-quoted values are -left as-is. +`TIMESTAMP` values are quoted automatically (`= main` → `'main'`), and an +already-quoted literal is kept as-is (`= '2024-01-01'`). Numeric, `BOOLEAN`, and +`BINARY` values are validated against a strict literal shape (`= 100`, `= true`, +`= X'00'`); a value that doesn't match — anything that could otherwise inject SQL +into the describe statement — is ignored and the parameter falls back to its +type-based placeholder, so a sample value can never break out of the +`DESCRIBE QUERY`. ## Server-injected parameters diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index c3801a274..2573155ff 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -394,23 +394,61 @@ export function defaultForType(sqlType: string | undefined): string { } /** - * Format a user-supplied sample value as a SQL literal for substitution. - * String-like types are wrapped in single quotes (with `'` doubled) unless the - * value is already a quoted literal; numeric/boolean/binary values pass through - * verbatim so callers can write `= 5`, `= true`, or `= X'00'` directly. + * True when `raw` is already a single, well-formed SQL single-quoted string + * literal — i.e. it opens and closes with `'` and every interior quote is part + * of an escaped `''` pair. `'2024-01-01'` and `'O''Brien'` qualify; + * `'a' OR 1=1 OR 'b'` does not (it has lone interior quotes), so it is treated + * as raw content and re-escaped rather than trusted. */ -function formatSampleValue(sqlType: string | undefined, raw: string): string { - if (raw.startsWith("'") && raw.endsWith("'") && raw.length >= 2) { - return raw; +function isWellFormedStringLiteral(raw: string): boolean { + if (raw.length < 2 || !raw.startsWith("'") || !raw.endsWith("'")) { + return false; } + const inner = raw.slice(1, -1); + return !inner.replace(/''/g, "").includes("'"); +} + +/** + * Format a user-supplied sample value as a SQL literal for substitution into + * the build-time DESCRIBE statement. Returns `null` when the value isn't valid + * for its type, so the caller falls back to the safe type-based placeholder + * instead of substituting attacker-controllable text. + * + * The value comes from a `.sql` file that may be shared via a template or + * dependency, so it must not be able to inject SQL into `DESCRIBE QUERY`: + * - string-like types are always emitted as one well-formed, fully-escaped + * single-quoted literal (a pre-quoted literal is kept as-is, anything else is + * quoted with `'` doubled), so the value can never break out of the string; + * - numeric / boolean / binary values must match a strict literal shape and are + * rejected (`null`) otherwise, rather than being passed through verbatim. + */ +function formatSampleValue( + sqlType: string | undefined, + raw: string, +): string | null { switch (sqlType?.toUpperCase()) { case "STRING": case "DATE": case "TIMESTAMP": case "TIMESTAMP_NTZ": - return `'${raw.replace(/'/g, "''")}'`; + return isWellFormedStringLiteral(raw) + ? raw + : `'${raw.replace(/'/g, "''")}'`; + case "NUMERIC": + case "DECIMAL": + case "BIGINT": + case "TINYINT": + case "SMALLINT": + case "INT": + case "FLOAT": + case "DOUBLE": + return /^[+-]?\d+(\.\d+)?$/.test(raw) ? raw : null; + case "BOOLEAN": + return /^(?:true|false)$/i.test(raw) ? raw.toLowerCase() : null; + case "BINARY": + return /^X'[0-9a-fA-F]*'$/i.test(raw) ? raw : null; default: - return raw; + return null; } } @@ -434,7 +472,12 @@ export function extractParameterDefaults(sql: string): Record { /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\s*=\s*(.+?)\s*$/gim; for (const match of sql.matchAll(regex)) { const [, paramName, paramType, rawValue] = match; - defaults[paramName] = formatSampleValue(paramType, rawValue); + const formatted = formatSampleValue(paramType, rawValue); + // A value that fails type validation is dropped, not substituted: the param + // then falls back to the safe type-based placeholder during DESCRIBE. + if (formatted !== null) { + defaults[paramName] = formatted; + } } return defaults; } diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index e961c1f57..3858ebf44 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -642,6 +642,43 @@ describe("extractParameterDefaults", () => { const sql = "-- @param onlyType STRING\nSELECT 1"; expect(extractParameterDefaults(sql)).toEqual({}); }); + + test("rejects numeric sample values that aren't plain numbers (no injection)", () => { + const sql = [ + "-- @param n INT = 0) UNION SELECT secret FROM creds --", + "-- @param r DOUBLE = 1; DROP TABLE x", + "SELECT 1", + ].join("\n"); + // Both fail strict numeric validation and are dropped (not substituted), + // so they fall back to the safe type default during DESCRIBE. + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("rejects non-boolean BOOLEAN sample values", () => { + const sql = "-- @param flag BOOLEAN = 1 OR 1=1\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("accepts well-formed BINARY and rejects malformed BINARY", () => { + expect( + extractParameterDefaults("-- @param b BINARY = X'00'\nSELECT 1"), + ).toEqual({ b: "X'00'" }); + expect( + extractParameterDefaults("-- @param b BINARY = X'00' OR 1=1\nSELECT 1"), + ).toEqual({}); + }); + + test("neutralizes a string value that isn't a well-formed literal by escaping it", () => { + const sql = ["-- @param x STRING = 'a' OR 1=1 OR 'b'", "SELECT 1"].join( + "\n", + ); + // The lone interior quotes mean this isn't a single well-formed literal, so + // it's treated as raw content and fully escaped — one inert string literal, + // no SQL break-out. + expect(extractParameterDefaults(sql)).toEqual({ + x: "'''a'' OR 1=1 OR ''b'''", + }); + }); }); describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { @@ -689,4 +726,18 @@ describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { expect(out).toContain("':target_catalog' AS lit"); expect(out).toContain("'main' AS p"); }); + + test("rejects an injection attempt in a numeric param, falling back to the placeholder", () => { + const sql = [ + "-- @param n INT = 1); DROP TABLE x --", + "SELECT * FROM t LIMIT :n", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + // The malicious value is dropped; :n falls back to the INT default 0, so the + // payload never reaches the executable LIMIT clause. (It survives only in the + // @param comment line, which is inert.) + expect(out).toContain("SELECT * FROM t LIMIT 0"); + expect(out).not.toContain(":n"); + expect(out).not.toContain("LIMIT 1)"); + }); }); From 0ddcda52515e75022d10166edba982b9e0098333 Mon Sep 17 00:00:00 2001 From: MarioCadenas Date: Wed, 24 Jun 2026 12:30:59 +0200 Subject: [PATCH 3/4] fix(type-generator): close two gaps in describe-time sample values (#383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the IDENTIFIER sample-value support, fixing two issues found in review. 1. SQL injection via backslash (security). formatSampleValue escaped only single quotes, but Databricks/Spark treats backslash as a string-literal escape by default, so a value like `x\' UNION SELECT secret FROM creds --` broke out of the literal during the build-time DESCRIBE QUERY. Now escape `\` before `'`, and reject backslash-bearing pre-quoted literals in isWellFormedStringLiteral (e.g. the unterminated `'a\'`). Verified end-to-end against a warehouse: the old escaping fails with TABLE_OR_VIEW_NOT_FOUND(creds); the fix describes it as a single string column. 2. Value-less @param line swallowed the next line (correctness). The extraction regex used `\s` around `=`, which matches newlines, so `-- @param x STRING =` (no value) captured the following line — or the next @param annotation — as its sample value. Switched the inter-token whitespace to horizontal-only (`[^\S\r\n]`) so such a line no longer matches and falls back to the type placeholder. Adds tests for both (backslash neutralization, blank-value line). Signed-off-by: MarioCadenas --- .../src/type-generator/query-registry.ts | 24 ++++++++---- .../tests/query-registry.test.ts | 38 +++++++++++++++++++ 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index 2573155ff..68e8beee4 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -398,13 +398,16 @@ export function defaultForType(sqlType: string | undefined): string { * literal — i.e. it opens and closes with `'` and every interior quote is part * of an escaped `''` pair. `'2024-01-01'` and `'O''Brien'` qualify; * `'a' OR 1=1 OR 'b'` does not (it has lone interior quotes), so it is treated - * as raw content and re-escaped rather than trusted. + * as raw content and re-escaped rather than trusted. A backslash also + * disqualifies it: Databricks/Spark treats `\` as an escape inside literals (so + * `'a\'` is unterminated), and we never trust such input — it is re-escaped. */ function isWellFormedStringLiteral(raw: string): boolean { if (raw.length < 2 || !raw.startsWith("'") || !raw.endsWith("'")) { return false; } const inner = raw.slice(1, -1); + if (inner.includes("\\")) return false; return !inner.replace(/''/g, "").includes("'"); } @@ -418,7 +421,8 @@ function isWellFormedStringLiteral(raw: string): boolean { * dependency, so it must not be able to inject SQL into `DESCRIBE QUERY`: * - string-like types are always emitted as one well-formed, fully-escaped * single-quoted literal (a pre-quoted literal is kept as-is, anything else is - * quoted with `'` doubled), so the value can never break out of the string; + * quoted with both `\` and `'` doubled so neither can terminate the literal), + * so the value can never break out of the string; * - numeric / boolean / binary values must match a strict literal shape and are * rejected (`null`) otherwise, rather than being passed through verbatim. */ @@ -433,7 +437,7 @@ function formatSampleValue( case "TIMESTAMP_NTZ": return isWellFormedStringLiteral(raw) ? raw - : `'${raw.replace(/'/g, "''")}'`; + : `'${raw.replace(/\\/g, "\\\\").replace(/'/g, "''")}'`; case "NUMERIC": case "DECIMAL": case "BIGINT": @@ -467,9 +471,13 @@ function formatSampleValue( export function extractParameterDefaults(sql: string): Record { const defaults: Record = {}; // Mirrors extractParameterTypes' type alternation, then requires `= ` - // through end-of-line. Lines without a value are left to extractParameterTypes. + // on the same line. All inter-token whitespace is horizontal-only + // (`[^\S\r\n]`, not `\s`): `\s` matches newlines, so a value-less line like + // `-- @param x STRING =` would let `\s*=\s*(.+?)` swallow the *next* line as + // the sample value. Restricting to same-line whitespace makes such a line + // simply not match, so it correctly falls back to the type placeholder. const regex = - /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\s*=\s*(.+?)\s*$/gim; + /--[^\S\r\n]*@param[^\S\r\n]+(\w+)[^\S\r\n]+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)[^\S\r\n]*=[^\S\r\n]*(.+?)[^\S\r\n]*$/gim; for (const match of sql.matchAll(regex)) { const [, paramName, paramType, rawValue] = match; const formatted = formatSampleValue(paramType, rawValue); @@ -665,9 +673,9 @@ export async function generateQueriesFromDescribe( const annotatedTypes = extractParameterTypes(sql); const inferredTypes = inferParameterTypes(sql, protectedRanges); const parameterTypes = { ...inferredTypes, ...annotatedTypes }; - // Explicit describe-time sample values (`-- @param name TYPE = value`) - // take precedence over the type-based default so queries with dynamic - // table names (IDENTIFIER) can resolve a real table during DESCRIBE. + // substituteParametersForDescribe applies `-- @param name TYPE = value` + // sample values (e.g. for IDENTIFIER table names) ahead of type defaults; + // parameterDefaults is recomputed here only to skip them in the warn loop. const parameterDefaults = extractParameterDefaults(sql); const sqlWithDefaults = substituteParametersForDescribe(sql); diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index 3858ebf44..1e37c8020 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -679,6 +679,44 @@ describe("extractParameterDefaults", () => { x: "'''a'' OR 1=1 OR ''b'''", }); }); + + test("escapes backslashes so a string value can't break out via \\'", () => { + // Databricks/Spark treats `\` as an escape inside string literals, so + // doubling only `'` is not enough: `x\' UNION ...` quoted as `'x\'' ...'` + // would let `\'` escape the first quote and the next `'` close the literal, + // turning the rest into executable SQL. Escaping `\` -> `\\` first keeps the + // value as one inert literal. + const sql = [ + "-- @param x STRING = x\\' UNION SELECT secret FROM creds --", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + x: "'x\\\\'' UNION SELECT secret FROM creds --'", + }); + // A backslash-bearing "looks pre-quoted" value is not trusted either: it is + // re-escaped rather than passed through (an unterminated `'a\'`). + expect( + extractParameterDefaults("-- @param y STRING = 'a\\'\nSELECT 1"), + ).toEqual({ y: "'''a\\\\'''" }); + }); + + test("a value-less `=` line does not swallow the following line", () => { + // `\s` would match the newline and capture the next line as the value; the + // horizontal-only whitespace class makes this line simply not match, so the + // param falls back to its type placeholder during DESCRIBE. + const blankValue = ["-- @param target_catalog STRING =", "SELECT 1"].join( + "\n", + ); + expect(extractParameterDefaults(blankValue)).toEqual({}); + + // ...and it must not consume the *next* @param annotation either. + const blankThenNext = [ + "-- @param a STRING =", + "-- @param b INT = 5", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(blankThenNext)).toEqual({ b: "5" }); + }); }); describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { From 81ed18203e7d7c264c95c1ae7bb06cdc3f91e542 Mon Sep 17 00:00:00 2001 From: MarioCadenas Date: Wed, 24 Jun 2026 12:44:37 +0200 Subject: [PATCH 4/4] refactor(type-generator): dedupe @param type alternation (#383) extractParameterTypes and extractParameterDefaults each hardcoded the same STRING|NUMERIC|...|BINARY alternation, kept in sync by hand. Extract it to a single PARAM_TYPE_ALTERNATION constant built into both regexes via RegExp so the two can't drift. No behavior change. Signed-off-by: MarioCadenas --- .../src/type-generator/query-registry.ts | 33 ++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index 68e8beee4..236da2ae7 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -349,12 +349,19 @@ function degradedType( : generateUnknownResultQuery(sql, queryName); } +// Single source of truth for the `@param` type alternation, shared by +// extractParameterTypes and extractParameterDefaults so the two can't drift. +// Alternation order matters: TIMESTAMP_NTZ must precede TIMESTAMP so the regex +// engine doesn't greedy-match TIMESTAMP and leave `_NTZ` unconsumed. +const PARAM_TYPE_ALTERNATION = + "STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY"; + export function extractParameterTypes(sql: string): Record { const paramTypes: Record = {}; - // Alternation order matters: TIMESTAMP_NTZ must precede TIMESTAMP so the - // regex engine doesn't greedy-match TIMESTAMP and leave `_NTZ` unconsumed. - const regex = - /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\b/gi; + const regex = new RegExp( + `--\\s*@param\\s+(\\w+)\\s+(${PARAM_TYPE_ALTERNATION})\\b`, + "gi", + ); const matches = sql.matchAll(regex); for (const match of matches) { const [, paramName, paramType] = match; @@ -470,14 +477,16 @@ function formatSampleValue( */ export function extractParameterDefaults(sql: string): Record { const defaults: Record = {}; - // Mirrors extractParameterTypes' type alternation, then requires `= ` - // on the same line. All inter-token whitespace is horizontal-only - // (`[^\S\r\n]`, not `\s`): `\s` matches newlines, so a value-less line like - // `-- @param x STRING =` would let `\s*=\s*(.+?)` swallow the *next* line as - // the sample value. Restricting to same-line whitespace makes such a line - // simply not match, so it correctly falls back to the type placeholder. - const regex = - /--[^\S\r\n]*@param[^\S\r\n]+(\w+)[^\S\r\n]+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)[^\S\r\n]*=[^\S\r\n]*(.+?)[^\S\r\n]*$/gim; + // Reuses PARAM_TYPE_ALTERNATION, then requires `= ` on the same line. + // All inter-token whitespace is horizontal-only (`[^\S\r\n]`, not `\s`): `\s` + // matches newlines, so a value-less line like `-- @param x STRING =` would let + // `\s*=\s*(.+?)` swallow the *next* line as the sample value. Restricting to + // same-line whitespace makes such a line simply not match, so it correctly + // falls back to the type placeholder. + const regex = new RegExp( + `--[^\\S\\r\\n]*@param[^\\S\\r\\n]+(\\w+)[^\\S\\r\\n]+(${PARAM_TYPE_ALTERNATION})[^\\S\\r\\n]*=[^\\S\\r\\n]*(.+?)[^\\S\\r\\n]*$`, + "gim", + ); for (const match of sql.matchAll(regex)) { const [, paramName, paramType, rawValue] = match; const formatted = formatSampleValue(paramType, rawValue);