diff --git a/docs/docs/development/type-generation.md b/docs/docs/development/type-generation.md index c715791a9..48d239ff9 100644 --- a/docs/docs/development/type-generation.md +++ b/docs/docs/development/type-generation.md @@ -94,6 +94,17 @@ The type generator: 4. Generates TypeScript interfaces for query parameters and results 5. Creates a `QueryRegistry` type for type-safe query execution +### Parameters during `DESCRIBE QUERY` + +Type generation describes each query without binding real parameters, so it +substitutes a placeholder default for every `:param` (e.g. `''` for a string). +That breaks queries whose shape depends on a value — most notably dynamic table +names via `IDENTIFIER(:catalog || '.schema.table')`. Annotate such parameters +with a sample value (`-- @param catalog STRING = main`) so the describe call can +resolve a real table. The sample value is used only at type-generation time; the +runtime query still binds the actual parameter. See +[SQL parameters → Sample values](../plugins/analytics.md#sample-values-for-type-generation). + ## Using generated types Once types are generated, your IDE will provide autocomplete and type checking: diff --git a/docs/docs/plugins/analytics.md b/docs/docs/plugins/analytics.md index 3198d4340..1ed102107 100644 --- a/docs/docs/plugins/analytics.md +++ b/docs/docs/plugins/analytics.md @@ -61,6 +61,34 @@ at the call site. - `FLOAT`, `DOUBLE` — bind via `sql.float()` / `sql.double()` - `NUMERIC`, `DECIMAL` — bind via `sql.numeric()` (pass strings for precision) +### Sample values for type generation + +Some queries only have a valid shape once a parameter has a concrete value — most +commonly a dynamic table name built with `IDENTIFIER()`. During type generation +AppKit runs `DESCRIBE QUERY` with placeholder defaults, so an unresolved parameter +collapses to an empty string and produces invalid SQL +(`IDENTIFIER('' || '.schema.table')` → `PARSE_SYNTAX_ERROR`). + +Append `= value` to a `-- @param` annotation to give type generation a sample +value. It is used **only** while describing the query; at runtime the real +parameter is still bound, so the query stays portable across environments: + +```sql +-- @param target_catalog STRING = main +SELECT * +FROM IDENTIFIER(:target_catalog || '.sales.nation') +``` + +Type generation describes `main.sales.nation` to infer the result columns, while +the deployed app binds whatever catalog the caller passes. String, `DATE`, and +`TIMESTAMP` values are quoted automatically (`= main` → `'main'`), and an +already-quoted literal is kept as-is (`= '2024-01-01'`). Numeric, `BOOLEAN`, and +`BINARY` values are validated against a strict literal shape (`= 100`, `= true`, +`= X'00'`); a value that doesn't match — anything that could otherwise inject SQL +into the describe statement — is ignored and the parameter falls back to its +type-based placeholder, so a sample value can never break out of the +`DESCRIBE QUERY`. + ## Server-injected parameters `:workspaceId` is **injected by the server** and **must not** be annotated: diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index ce0cdd0b7..236da2ae7 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -349,12 +349,19 @@ function degradedType( : generateUnknownResultQuery(sql, queryName); } +// Single source of truth for the `@param` type alternation, shared by +// extractParameterTypes and extractParameterDefaults so the two can't drift. +// Alternation order matters: TIMESTAMP_NTZ must precede TIMESTAMP so the regex +// engine doesn't greedy-match TIMESTAMP and leave `_NTZ` unconsumed. +const PARAM_TYPE_ALTERNATION = + "STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY"; + export function extractParameterTypes(sql: string): Record { const paramTypes: Record = {}; - // Alternation order matters: TIMESTAMP_NTZ must precede TIMESTAMP so the - // regex engine doesn't greedy-match TIMESTAMP and leave `_NTZ` unconsumed. - const regex = - /--\s*@param\s+(\w+)\s+(STRING|NUMERIC|DECIMAL|BIGINT|TINYINT|SMALLINT|INT|FLOAT|DOUBLE|BOOLEAN|DATE|TIMESTAMP_NTZ|TIMESTAMP|BINARY)\b/gi; + const regex = new RegExp( + `--\\s*@param\\s+(\\w+)\\s+(${PARAM_TYPE_ALTERNATION})\\b`, + "gi", + ); const matches = sql.matchAll(regex); for (const match of matches) { const [, paramName, paramType] = match; @@ -393,6 +400,169 @@ export function defaultForType(sqlType: string | undefined): string { } } +/** + * True when `raw` is already a single, well-formed SQL single-quoted string + * literal — i.e. it opens and closes with `'` and every interior quote is part + * of an escaped `''` pair. `'2024-01-01'` and `'O''Brien'` qualify; + * `'a' OR 1=1 OR 'b'` does not (it has lone interior quotes), so it is treated + * as raw content and re-escaped rather than trusted. A backslash also + * disqualifies it: Databricks/Spark treats `\` as an escape inside literals (so + * `'a\'` is unterminated), and we never trust such input — it is re-escaped. + */ +function isWellFormedStringLiteral(raw: string): boolean { + if (raw.length < 2 || !raw.startsWith("'") || !raw.endsWith("'")) { + return false; + } + const inner = raw.slice(1, -1); + if (inner.includes("\\")) return false; + return !inner.replace(/''/g, "").includes("'"); +} + +/** + * Format a user-supplied sample value as a SQL literal for substitution into + * the build-time DESCRIBE statement. Returns `null` when the value isn't valid + * for its type, so the caller falls back to the safe type-based placeholder + * instead of substituting attacker-controllable text. + * + * The value comes from a `.sql` file that may be shared via a template or + * dependency, so it must not be able to inject SQL into `DESCRIBE QUERY`: + * - string-like types are always emitted as one well-formed, fully-escaped + * single-quoted literal (a pre-quoted literal is kept as-is, anything else is + * quoted with both `\` and `'` doubled so neither can terminate the literal), + * so the value can never break out of the string; + * - numeric / boolean / binary values must match a strict literal shape and are + * rejected (`null`) otherwise, rather than being passed through verbatim. + */ +function formatSampleValue( + sqlType: string | undefined, + raw: string, +): string | null { + switch (sqlType?.toUpperCase()) { + case "STRING": + case "DATE": + case "TIMESTAMP": + case "TIMESTAMP_NTZ": + return isWellFormedStringLiteral(raw) + ? raw + : `'${raw.replace(/\\/g, "\\\\").replace(/'/g, "''")}'`; + case "NUMERIC": + case "DECIMAL": + case "BIGINT": + case "TINYINT": + case "SMALLINT": + case "INT": + case "FLOAT": + case "DOUBLE": + return /^[+-]?\d+(\.\d+)?$/.test(raw) ? raw : null; + case "BOOLEAN": + return /^(?:true|false)$/i.test(raw) ? raw.toLowerCase() : null; + case "BINARY": + return /^X'[0-9a-fA-F]*'$/i.test(raw) ? raw : null; + default: + return null; + } +} + +/** + * Parse optional describe-time sample values from `@param` annotations, e.g. + * `-- @param target_catalog STRING = main`. The value is substituted into the + * SQL **only during DESCRIBE QUERY** so type generation can resolve queries + * whose shape depends on a parameter value — most notably dynamic table names + * via `IDENTIFIER(:target_catalog || '.schema.table')`, where the empty-string + * default would otherwise produce malformed SQL. Runtime binding is unaffected: + * the analytics plugin still binds the real parameter at execution time, so the + * query stays portable across environments. + * + * Returns a map of parameter name to the formatted SQL literal to substitute. + */ +export function extractParameterDefaults(sql: string): Record { + const defaults: Record = {}; + // Reuses PARAM_TYPE_ALTERNATION, then requires `= ` on the same line. + // All inter-token whitespace is horizontal-only (`[^\S\r\n]`, not `\s`): `\s` + // matches newlines, so a value-less line like `-- @param x STRING =` would let + // `\s*=\s*(.+?)` swallow the *next* line as the sample value. Restricting to + // same-line whitespace makes such a line simply not match, so it correctly + // falls back to the type placeholder. + const regex = new RegExp( + `--[^\\S\\r\\n]*@param[^\\S\\r\\n]+(\\w+)[^\\S\\r\\n]+(${PARAM_TYPE_ALTERNATION})[^\\S\\r\\n]*=[^\\S\\r\\n]*(.+?)[^\\S\\r\\n]*$`, + "gim", + ); + for (const match of sql.matchAll(regex)) { + const [, paramName, paramType, rawValue] = match; + const formatted = formatSampleValue(paramType, rawValue); + // A value that fails type validation is dropped, not substituted: the param + // then falls back to the safe type-based placeholder during DESCRIBE. + if (formatted !== null) { + defaults[paramName] = formatted; + } + } + return defaults; +} + +/** + * Replace `:param` placeholders with describe-time literals so `DESCRIBE QUERY` + * can run without bound parameters. Resolution order per parameter: + * 1. An explicit `-- @param name TYPE = value` sample value (wins), which lets + * dynamic table names via `IDENTIFIER(...)` resolve to a real table. + * 2. Otherwise a placeholder default derived from the annotated/inferred type. + * Placeholders inside string literals or comments are left untouched. + */ +export function substituteParametersForDescribe(sql: string): string { + const protectedRanges = getProtectedRanges(sql); + const annotatedTypes = extractParameterTypes(sql); + const inferredTypes = inferParameterTypes(sql, protectedRanges); + const parameterTypes = { ...inferredTypes, ...annotatedTypes }; + const parameterDefaults = extractParameterDefaults(sql); + return sql.replace( + /(? { + if (isInsideProtectedRange(offset, protectedRanges)) { + return original; + } + const sampleValue = parameterDefaults[paramName]; + if (sampleValue !== undefined) { + return sampleValue; + } + return defaultForType(parameterTypes[paramName]); + }, + ); +} + +/** + * Append a remediation hint when a DESCRIBE failure looks like a dynamic + * identifier that couldn't be resolved: the query calls `IDENTIFIER(...)` and + * has at least one parameter without a describe-time sample value. These fail + * because typegen substitutes a placeholder default (e.g. `''`) that yields a + * malformed or non-existent table name. Steering the user to the `= value` + * annotation turns the fatal error into a one-line fix. + */ +function withIdentifierHint( + error: { code?: string; message: string }, + sql: string, +): { code?: string; message: string } { + if (!/\bIDENTIFIER\s*\(/i.test(sql)) { + return error; + } + const protectedRanges = getProtectedRanges(sql); + const params = extractParameters(sql, protectedRanges); + const defaults = extractParameterDefaults(sql); + const unresolved = params.filter( + (p) => !SERVER_INJECTED_PARAMS.includes(p) && defaults[p] === undefined, + ); + if (unresolved.length === 0) { + return error; + } + const example = unresolved[0]; + return { + ...error, + message: `${error.message}\n Hint: this query uses IDENTIFIER() with parameter(s) ${unresolved + .map((p) => `:${p}`) + .join( + ", ", + )}. Give type generation a sample value so it can resolve the table, e.g. \`-- @param ${example} STRING = my_catalog\`. The runtime query still binds the real parameter.`, + }; +} + /** * Infer parameter types from positional context in SQL. * V1 only infers NUMERIC from patterns like LIMIT, OFFSET, TOP, @@ -512,20 +682,17 @@ export async function generateQueriesFromDescribe( const annotatedTypes = extractParameterTypes(sql); const inferredTypes = inferParameterTypes(sql, protectedRanges); const parameterTypes = { ...inferredTypes, ...annotatedTypes }; - const sqlWithDefaults = sql.replace( - /(? { - if (isInsideProtectedRange(offset, protectedRanges)) { - return original; - } - return defaultForType(parameterTypes[paramName]); - }, - ); + // substituteParametersForDescribe applies `-- @param name TYPE = value` + // sample values (e.g. for IDENTIFIER table names) ahead of type defaults; + // parameterDefaults is recomputed here only to skip them in the warn loop. + const parameterDefaults = extractParameterDefaults(sql); + const sqlWithDefaults = substituteParametersForDescribe(sql); // Warn about unresolved parameters const allParams = extractParameters(sql, protectedRanges); for (const param of allParams) { if (SERVER_INJECTED_PARAMS.includes(param)) continue; + if (parameterDefaults[param]) continue; if (parameterTypes[param]) continue; logger.warn( '%s: parameter ":%s" has no type annotation or inference. Add %s to the query file.', @@ -711,7 +878,7 @@ export async function generateQueriesFromDescribe( status: "syntax", index, schema: { name: queryName, type }, - error: parseError(sqlError), + error: withIdentifierHint(parseError(sqlError), sql), }; } diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index b149d5bbe..1e37c8020 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -2,12 +2,14 @@ import { describe, expect, test } from "vitest"; import { convertToQueryType, defaultForType, + extractParameterDefaults, extractParameters, extractParameterTypes, getProtectedRanges, inferParameterTypes, normalizeTypeName, SERVER_INJECTED_PARAMS, + substituteParametersForDescribe, } from "../query-registry"; import type { DatabricksStatementExecutionResponse } from "../types"; @@ -590,3 +592,190 @@ describe("substitution skips protected ranges", () => { expect(result).toContain("id = ''"); }); }); + +describe("extractParameterDefaults", () => { + test("returns empty object when no sample values are present", () => { + const sql = + "-- @param startDate DATE\nSELECT * FROM t WHERE d = :startDate"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("quotes string-like sample values", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "-- @param since DATE = 2024-01-01", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + target_catalog: "'main'", + since: "'2024-01-01'", + }); + }); + + test("passes numeric and boolean sample values through verbatim", () => { + const sql = [ + "-- @param limit INT = 100", + "-- @param ratio DOUBLE = 0.5", + "-- @param active BOOLEAN = true", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + limit: "100", + ratio: "0.5", + active: "true", + }); + }); + + test("leaves already-quoted values untouched and escapes embedded quotes", () => { + const sql = [ + "-- @param a STRING = 'pre-quoted'", + "-- @param b STRING = O'Brien", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + a: "'pre-quoted'", + b: "'O''Brien'", + }); + }); + + test("ignores @param lines without a value", () => { + const sql = "-- @param onlyType STRING\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("rejects numeric sample values that aren't plain numbers (no injection)", () => { + const sql = [ + "-- @param n INT = 0) UNION SELECT secret FROM creds --", + "-- @param r DOUBLE = 1; DROP TABLE x", + "SELECT 1", + ].join("\n"); + // Both fail strict numeric validation and are dropped (not substituted), + // so they fall back to the safe type default during DESCRIBE. + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("rejects non-boolean BOOLEAN sample values", () => { + const sql = "-- @param flag BOOLEAN = 1 OR 1=1\nSELECT 1"; + expect(extractParameterDefaults(sql)).toEqual({}); + }); + + test("accepts well-formed BINARY and rejects malformed BINARY", () => { + expect( + extractParameterDefaults("-- @param b BINARY = X'00'\nSELECT 1"), + ).toEqual({ b: "X'00'" }); + expect( + extractParameterDefaults("-- @param b BINARY = X'00' OR 1=1\nSELECT 1"), + ).toEqual({}); + }); + + test("neutralizes a string value that isn't a well-formed literal by escaping it", () => { + const sql = ["-- @param x STRING = 'a' OR 1=1 OR 'b'", "SELECT 1"].join( + "\n", + ); + // The lone interior quotes mean this isn't a single well-formed literal, so + // it's treated as raw content and fully escaped — one inert string literal, + // no SQL break-out. + expect(extractParameterDefaults(sql)).toEqual({ + x: "'''a'' OR 1=1 OR ''b'''", + }); + }); + + test("escapes backslashes so a string value can't break out via \\'", () => { + // Databricks/Spark treats `\` as an escape inside string literals, so + // doubling only `'` is not enough: `x\' UNION ...` quoted as `'x\'' ...'` + // would let `\'` escape the first quote and the next `'` close the literal, + // turning the rest into executable SQL. Escaping `\` -> `\\` first keeps the + // value as one inert literal. + const sql = [ + "-- @param x STRING = x\\' UNION SELECT secret FROM creds --", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(sql)).toEqual({ + x: "'x\\\\'' UNION SELECT secret FROM creds --'", + }); + // A backslash-bearing "looks pre-quoted" value is not trusted either: it is + // re-escaped rather than passed through (an unterminated `'a\'`). + expect( + extractParameterDefaults("-- @param y STRING = 'a\\'\nSELECT 1"), + ).toEqual({ y: "'''a\\\\'''" }); + }); + + test("a value-less `=` line does not swallow the following line", () => { + // `\s` would match the newline and capture the next line as the value; the + // horizontal-only whitespace class makes this line simply not match, so the + // param falls back to its type placeholder during DESCRIBE. + const blankValue = ["-- @param target_catalog STRING =", "SELECT 1"].join( + "\n", + ); + expect(extractParameterDefaults(blankValue)).toEqual({}); + + // ...and it must not consume the *next* @param annotation either. + const blankThenNext = [ + "-- @param a STRING =", + "-- @param b INT = 5", + "SELECT 1", + ].join("\n"); + expect(extractParameterDefaults(blankThenNext)).toEqual({ b: "5" }); + }); +}); + +describe("substituteParametersForDescribe (IDENTIFIER support, #383)", () => { + test("empty-string default produces malformed SQL for IDENTIFIER params", () => { + // Reproduces the bug: with no sample value, an untyped IDENTIFIER param + // collapses to '' and yields IDENTIFIER('' || '.schema.table'), a leading-dot + // identifier that Databricks rejects with PARSE_SYNTAX_ERROR. + const sql = "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')"; + expect(substituteParametersForDescribe(sql)).toBe( + "SELECT * FROM IDENTIFIER('' || '.sales.nation')", + ); + }); + + test("sample value resolves IDENTIFIER to a real, describable table", () => { + const sql = [ + "-- @param target_catalog STRING = main", + "SELECT * FROM IDENTIFIER(:target_catalog || '.sales.nation')", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + expect(out).toContain("IDENTIFIER('main' || '.sales.nation')"); + expect(out).not.toContain(":target_catalog"); + }); + + test("sample value wins over the type-based default", () => { + const sql = [ + "-- @param status STRING = active", + "SELECT * FROM t WHERE status = :status", + ].join("\n"); + expect(substituteParametersForDescribe(sql)).toContain("status = 'active'"); + }); + + test("falls back to the type default when no sample value is given", () => { + const sql = ["-- @param limit INT", "SELECT * FROM t LIMIT :limit"].join( + "\n", + ); + expect(substituteParametersForDescribe(sql)).toBe( + "-- @param limit INT\nSELECT * FROM t LIMIT 0", + ); + }); + + test("does not substitute placeholders inside string literals", () => { + const sql = "SELECT ':target_catalog' AS lit, :target_catalog AS p"; + const sql2 = ["-- @param target_catalog STRING = main", sql].join("\n"); + const out = substituteParametersForDescribe(sql2); + expect(out).toContain("':target_catalog' AS lit"); + expect(out).toContain("'main' AS p"); + }); + + test("rejects an injection attempt in a numeric param, falling back to the placeholder", () => { + const sql = [ + "-- @param n INT = 1); DROP TABLE x --", + "SELECT * FROM t LIMIT :n", + ].join("\n"); + const out = substituteParametersForDescribe(sql); + // The malicious value is dropped; :n falls back to the INT default 0, so the + // payload never reaches the executable LIMIT clause. (It survives only in the + // @param comment line, which is inert.) + expect(out).toContain("SELECT * FROM t LIMIT 0"); + expect(out).not.toContain(":n"); + expect(out).not.toContain("LIMIT 1)"); + }); +});