-
Notifications
You must be signed in to change notification settings - Fork 1.3k
fix: More exhaustive athena types #6544
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2198,25 +2198,62 @@ def pg_type_code_to_arrow(code: int) -> str: | |
|
|
||
| def athena_to_feast_value_type(athena_type_as_str: str) -> ValueType: | ||
| # Type names from https://docs.aws.amazon.com/athena/latest/ug/data-types.html | ||
| athena_type = athena_type_as_str.lower().strip() | ||
| if athena_type.startswith("array"): | ||
| inner_type_match = re.search(r"(?:<|\[)(.+)(?:>|\])", athena_type) | ||
| if inner_type_match: | ||
| inner_type = inner_type_match.group(1).strip() | ||
| inner_feast_type = athena_to_feast_value_type(inner_type) | ||
|
|
||
| list_mapping = { | ||
| ValueType.BYTES: ValueType.BYTES_LIST, | ||
| ValueType.STRING: ValueType.STRING_LIST, | ||
| ValueType.INT32: ValueType.INT32_LIST, | ||
| ValueType.INT64: ValueType.INT64_LIST, | ||
| ValueType.DOUBLE: ValueType.DOUBLE_LIST, | ||
| ValueType.FLOAT: ValueType.FLOAT_LIST, | ||
| ValueType.BOOL: ValueType.BOOL_LIST, | ||
| ValueType.UNIX_TIMESTAMP: ValueType.UNIX_TIMESTAMP_LIST, | ||
| ValueType.MAP: ValueType.MAP_LIST, | ||
| ValueType.JSON: ValueType.JSON_LIST, | ||
| ValueType.STRUCT: ValueType.STRUCT_LIST, | ||
| ValueType.UUID: ValueType.UUID_LIST, | ||
| ValueType.DECIMAL: ValueType.DECIMAL_LIST, | ||
| } | ||
| return list_mapping.get(inner_feast_type, ValueType.VALUE_LIST) | ||
| return ValueType.VALUE_LIST | ||
|
|
||
| base_type = re.split(r"[(<\[]", athena_type)[0].strip() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When Athena returns a map type with a non-string key, e.g. Useful? React with 👍 / 👎. |
||
|
|
||
| if "timestamp" in base_type or "time" in base_type or "date" in base_type: | ||
| return ValueType.UNIX_TIMESTAMP | ||
|
|
||
| type_map = { | ||
| "null": ValueType.UNKNOWN, | ||
| "null": ValueType.UNKNOWN, # There is a null type, but this preserves backwards compat | ||
| "boolean": ValueType.BOOL, | ||
| "tinyint": ValueType.INT32, | ||
| "smallint": ValueType.INT32, | ||
| "int": ValueType.INT32, | ||
| "integer": ValueType.INT32, | ||
| "bigint": ValueType.INT64, | ||
| "double": ValueType.DOUBLE, | ||
| "float": ValueType.FLOAT, | ||
| "real": ValueType.FLOAT, | ||
| "decimal": ValueType.DECIMAL, | ||
| "binary": ValueType.BYTES, | ||
| "varbinary": ValueType.BYTES, | ||
| "char": ValueType.STRING, | ||
| "varchar": ValueType.STRING, | ||
| "string": ValueType.STRING, | ||
| "timestamp": ValueType.UNIX_TIMESTAMP, | ||
| "json": ValueType.JSON, | ||
| "struct": ValueType.STRUCT, | ||
| "row": ValueType.STRUCT, | ||
| "map": ValueType.MAP, | ||
| "uuid": ValueType.UUID, | ||
| "ipaddress": ValueType.STRING, | ||
| } | ||
| return type_map[athena_type_as_str.lower()] | ||
|
|
||
| return type_map.get(base_type, ValueType.UNKNOWN) | ||
|
|
||
|
|
||
| def pa_to_athena_value_type(pa_type: "pyarrow.DataType") -> str: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For Athena columns like
ARRAY<ARRAY<INT>>, the recursive call returnsValueType.INT32_LIST, which is not inlist_mapping, so this fallback classifies the column as genericVALUE_LIST. The generic path is later converted to placeholder string nested lists (Array(Array(String))infrom_value_type, and the Athena type string cannot be parsed by the PyArrow fallback), so feature inference/materialization will use the wrong schema for nested numeric, boolean, or timestamp arrays.Useful? React with 👍 / 👎.