Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 79 additions & 26 deletions checks/time_checks/check_time_range_vs_filename.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,37 @@
from netCDF4 import num2date


# Time-range token that ends a CMIP filename stem: "<start>-<end>",
# optionally followed by "-clim" for climatology files.
#
# Token lengths by frequency, as listed in the CMIP7 DRS specification
# ("Date/Time element specifications"):
#   yr / yrPt / dec        -> YYYY            (4)
#   mon / monPt / monC     -> YYYYMM          (6)
#   day / dayPt            -> YYYYMMDD        (8)
#   6hr / 6hrPt / 3hr…     -> YYYYMMDDhh      (10)
#   1hrCM / sub-daily Pt   -> YYYYMMDDhhmm    (12)
#   subhrPt                -> YYYYMMDDhhmmss  (14)
#
# Earlier versions of this check only accepted 6- or 8-digit tokens; all
# six lengths are accepted now so yearly and sub-daily files are covered.
#
# NOTE(review): the pattern does not require <start> and <end> to have the
# same number of digits (e.g. "2000-200012" matches) — confirm whether
# mixed-precision tokens should be rejected.
_TIME_RANGE_RE = re.compile(
    r"^(?P<start>\d{4}|\d{6}|\d{8}|\d{10}|\d{12}|\d{14})"
    r"-(?P<end>\d{4}|\d{6}|\d{8}|\d{10}|\d{12}|\d{14})"
    r"(?:-clim)?$"
)


def _extract_time_range_from_filename(filename: str):
"""
Extract time range token from filename.
Works for CMIP6 and CMIP7:
..._YYYYMM-YYYYMM.nc
..._YYYYMMDD-YYYYMMDD.nc
Returns (start_str, end_str, use_day) or (None, None, None) if not found.
..._YYYY-YYYY.nc (yearly)
..._YYYYMM-YYYYMM.nc (monthly)
..._YYYYMMDD-YYYYMMDD.nc (daily)
..._YYYYMMDDhh-YYYYMMDDhh.nc (hourly point)
..._YYYYMMDDhhmm-YYYYMMDDhhmm.nc (sub-hourly point)
..._YYYYMMDDhhmmss-YYYYMMDDhhmmss.nc
The optional ``-clim`` suffix on climatology files is also accepted.
Returns (start_str, end_str, precision_len) or (None, None, None)
if not found. ``precision_len`` is the digit-count (4/6/8/10/12/14)
so callers can decide how granular the coverage comparison should be.
"""
stem = filename[:-3] if filename.endswith(".nc") else filename
last_token = stem.split("_")[-1]
Expand All @@ -25,27 +46,39 @@ def _extract_time_range_from_filename(filename: str):

start_str = m.group("start")
end_str = m.group("end")
use_day = (len(start_str) == 8)
return start_str, end_str, use_day
return start_str, end_str, len(start_str)


def _tuple_from_datestr(s: str):
    """Convert a filename date token into a tuple of ints.

    Accepted layouts are YYYY, YYYYMM, YYYYMMDD, YYYYMMDDhh, YYYYMMDDhhmm
    and YYYYMMDDhhmmss. The result has one component per field present in
    the token, e.g. ``"200012"`` -> ``(2000, 12)``, so tuples built from
    tokens of the same length compare in chronological order.

    Raises:
        ValueError: if ``s`` is not exactly 4, 6, 8, 10, 12 or 14 ASCII
            digits. Field ranges (month 1-12, hour 0-23, ...) are not
            validated.
    """
    # int() alone tolerates signs, surrounding whitespace and underscores
    # ("+200" -> 200), so check the characters explicitly before slicing.
    if len(s) not in (4, 6, 8, 10, 12, 14) or not (s.isascii() and s.isdigit()):
        raise ValueError(f"Unrecognized time range token: {s}")

    # A four-digit year followed by zero to five two-digit fields.
    return (int(s[:4]),) + tuple(int(s[i:i + 2]) for i in range(4, len(s), 2))


def _coverage_from_time(ds):
"""
Prefer bounds if available:
time:bounds="time_bnds" and time_bnds(time, bnds)
Returns (start_tuple, end_tuple, use_day) where use_day indicates day precision.
Returns (start_tuple, end_tuple, err). Tuples are at second precision
(Y, M, D, h, m, s); callers truncate to the filename's precision.
"""
if "time" not in ds.variables:
return None, None, None, "Missing 'time' variable."
return None, None, "Missing 'time' variable."

tvar = ds.variables["time"]

Expand All @@ -65,8 +98,14 @@ def _coverage_from_time(ds):
start_dt = num2date(start_val, units=units, calendar=calendar)
end_dt = num2date(end_val, units=units, calendar=calendar)

return (start_dt.year, start_dt.month, start_dt.day), (end_dt.year, end_dt.month, end_dt.day), True, None
except Exception as e:
return (
(start_dt.year, start_dt.month, start_dt.day,
start_dt.hour, start_dt.minute, start_dt.second),
(end_dt.year, end_dt.month, end_dt.day,
end_dt.hour, end_dt.minute, end_dt.second),
None,
)
except Exception:
# fallback to time points if bounds conversion fails
pass

Expand All @@ -76,18 +115,30 @@ def _coverage_from_time(ds):
if hasattr(tvals, "compressed"):
tvals = tvals.compressed()
if tvals.size == 0:
return None, None, None, "The 'time' variable is empty."
return None, None, "The 'time' variable is empty."

units = tvar.units
calendar = getattr(tvar, "calendar", "standard")
dts = num2date(tvals, units=units, calendar=calendar)

first = dts[0]
last = dts[-1]
# points are often monthly midpoints; we compare month precision by default
return (first.year, first.month), (last.year, last.month), False, None
return (
(first.year, first.month, first.day,
first.hour, first.minute, first.second),
(last.year, last.month, last.day,
last.hour, last.minute, last.second),
None,
)
except Exception as e:
return None, None, None, f"Error converting time values: {e}"
return None, None, f"Error converting time values: {e}"


# Maps the digit count of a filename date token to how many leading
# fields of a (Y, M, D, h, m, s) tuple that token specifies:
# 4 digits (YYYY) -> 1 field, 8 digits (YYYYMMDD) -> 3 fields, and so on
# up to 14 digits (YYYYMMDDhhmmss) -> all 6 fields.
_PRECISION_TO_COMPONENTS = dict(zip((4, 6, 8, 10, 12, 14), range(1, 7)))


# Frequencies for which no time range token is expected in the filename.
Expand All @@ -110,7 +161,7 @@ def check_time_range_vs_filename(ds, severity=BaseCheck.MEDIUM):
frequency = getattr(ds, "frequency", None)

filename = os.path.basename(ds.filepath())
start_str, end_str, use_day_from_name = _extract_time_range_from_filename(filename)
start_str, end_str, precision = _extract_time_range_from_filename(filename)

if not start_str or not end_str:
# No time range token found in filename.
Expand All @@ -127,7 +178,8 @@ def check_time_range_vs_filename(ds, severity=BaseCheck.MEDIUM):
# Any other frequency should have a time range token — this is a real error.
ctx.add_failure(
f"No time range token found in filename, but frequency='{frequency}' "
"requires a time range (e.g. '_YYYYMM-YYYYMM.nc' or '_YYYYMMDD-YYYYMMDD.nc')."
"requires a time range (e.g. '_YYYY-YYYY.nc', '_YYYYMM-YYYYMM.nc', "
"'_YYYYMMDD-YYYYMMDD.nc', or '_YYYYMMDDhhmm-YYYYMMDDhhmm.nc')."
)
return [ctx.to_result()]

Expand All @@ -138,16 +190,17 @@ def check_time_range_vs_filename(ds, severity=BaseCheck.MEDIUM):
ctx.add_failure(f"Error parsing time range from filename: {e}")
return [ctx.to_result()]

cov_start, cov_end, cov_use_day, err = _coverage_from_time(ds)
cov_start, cov_end, err = _coverage_from_time(ds)
if err:
ctx.add_failure(err)
return [ctx.to_result()]

# Compare at month precision if filename is YYYYMM-YYYYMM
if not use_day_from_name:
# normalize coverage to (Y,M)
cov_start = (cov_start[0], cov_start[1])
cov_end = (cov_end[0], cov_end[1])
# Truncate the (Y, M, D, h, m, s) coverage tuple to the filename's
# precision so comparisons line up. e.g. YYYYMM filename → compare
# only (Y, M).
ncomp = _PRECISION_TO_COMPONENTS.get(precision, 3)
cov_start = cov_start[:ncomp]
cov_end = cov_end[:ncomp]

# Fail if dataset starts after expected start OR ends before expected end
if cov_start > expected_start or cov_end < expected_end:
Expand All @@ -157,4 +210,4 @@ def check_time_range_vs_filename(ds, severity=BaseCheck.MEDIUM):
else:
ctx.add_pass()

return [ctx.to_result()]
return [ctx.to_result()]