diff --git a/src/ros2_medkit_fault_manager/CMakeLists.txt b/src/ros2_medkit_fault_manager/CMakeLists.txt index 9421ed7ce..9852683c8 100644 --- a/src/ros2_medkit_fault_manager/CMakeLists.txt +++ b/src/ros2_medkit_fault_manager/CMakeLists.txt @@ -122,6 +122,7 @@ if(BUILD_TESTING) find_package(ament_lint_auto REQUIRED) find_package(ament_cmake_gtest REQUIRED) find_package(launch_testing_ament_cmake REQUIRED) + find_package(std_msgs REQUIRED) set(ament_cmake_clang_format_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/../../.clang-format") list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify ament_cmake_cpplint ament_cmake_clang_tidy) @@ -138,7 +139,7 @@ if(BUILD_TESTING) # in the same CTest invocation and launch their own fault_manager_node subprocess. ament_add_gtest(test_fault_manager test/test_fault_manager.cpp) target_link_libraries(test_fault_manager fault_manager_lib) - medkit_target_dependencies(test_fault_manager rclcpp ros2_medkit_msgs) + medkit_target_dependencies(test_fault_manager rclcpp ros2_medkit_msgs std_msgs) medkit_set_test_domain(test_fault_manager) # SQLite storage tests @@ -154,7 +155,7 @@ if(BUILD_TESTING) # Snapshot capture tests ament_add_gtest(test_snapshot_capture test/test_snapshot_capture.cpp) target_link_libraries(test_snapshot_capture fault_manager_lib) - medkit_target_dependencies(test_snapshot_capture rclcpp ros2_medkit_msgs) + medkit_target_dependencies(test_snapshot_capture rclcpp ros2_medkit_msgs std_msgs) medkit_set_test_domain(test_snapshot_capture) # Rosbag capture tests diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index f376f8dd1..1537269f3 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -52,6 +52,7 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ - **Persistent storage**: SQLite backend ensures faults survive node restarts - **Debounce filtering** (optional): AUTOSAR DEM-style counter-based fault confirmation with per-entity threshold overrides - **Snapshot capture**: Captures topic data when faults are confirmed for debugging (snapshots are deleted when fault is cleared) +- **Freeze-frame retention**: One compact JSON freeze-frame per fault code, retained across `clear_fault` (see below) - **Fault correlation** (optional): Root cause analysis with symptom muting and auto-clear - **Tamper-evident audit log** (optional): Append-only, hash-chained record of fault state transitions for verifiable history @@ -71,6 +72,8 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ Snapshots capture topic data when faults are confirmed for post-mortem debugging. +Each confirm also writes a **freeze-frame**: a single compact JSON object mapping every captured topic to its value at confirmation time, keyed by fault code. It differs from per-topic snapshots in two ways: snapshots are deleted when the fault is cleared, while the freeze-frame is retained across `clear_fault` (once the snapshots are gone, `~/get_fault` serves the retained frame so the confirmed-state record stays available after acknowledgement); and a re-confirm that captures nothing (e.g. source publishers down) never overwrites an existing non-empty frame. A fault code with no configured capture set gets no freeze-frame row; a configured capture that samples nothing on its first run records an empty `{}` frame. Freeze-frame storage is bounded by the number of distinct fault codes (one row per code, replaced in place) and rows are never evicted. + Under a fault storm, captures are bounded by a worker pool (`capture_pool_size`) draining a bounded queue (`capture_queue_depth`); excess captures are dropped per `capture_queue_full_policy` and logged (throttled). The pool is shared and is created when snapshots **or** rosbag is enabled, so these parameters bound both. `capture_pool_size` parallelizes freeze-frame snapshot capture only - rosbag is single-writer and records one fault at a time regardless of pool size. | Parameter | Type | Default | Description | diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp index 68b2f994c..25f7b0dc0 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp @@ -110,6 +110,18 @@ struct SnapshotData { int64_t captured_at_ns{0}; }; +/// Compact freeze-frame captured when a fault confirms: a single JSON object mapping +/// each captured topic to its latest value at confirmation time. Unlike per-topic +/// snapshots, a freeze-frame is keyed by fault_code (one row per code) and is RETAINED +/// across clear_fault, so the confirmed-state record persists after acknowledgement. +/// A row exists only for fault codes with a configured capture set; a fault code with +/// no capture configured gets no row at all (lookup returns nullopt, never an empty {}). +struct FreezeFrameData { + std::string fault_code; + std::string data; ///< Compact JSON object: {"": , ...} + int64_t captured_at_ns{0}; +}; + /// Rosbag file metadata for time-window recording struct RosbagFileInfo { std::string fault_code; @@ -189,6 +201,21 @@ class FaultStorage { virtual std::vector get_snapshots(const std::string & fault_code, const std::string & topic_filter = "") const = 0; + /// Store the compact freeze-frame captured for a fault (JSON dict of topic values). + /// Keyed by fault_code: a later capture for the same code replaces the frame. The frame + /// is retained across clear_fault so the confirmed-state record survives acknowledgement. + /// Storage is bounded by the number of distinct fault codes (one row per code, replaced + /// in place); rows are never evicted. Faults themselves are never deleted (clear_fault + /// only flips status), so there is currently no delete hook to tie eviction to. + /// @param frame The freeze-frame to store + virtual void store_freeze_frame(const FreezeFrameData & frame) = 0; + + /// Get the freeze-frame captured for a fault, if any. + /// @param fault_code The fault code to look up + /// @return The freeze-frame if one was captured, nullopt otherwise (including fault + /// codes with no capture configured, which never get a row) + virtual std::optional get_freeze_frame(const std::string & fault_code) const = 0; + /// Store rosbag file metadata for a fault /// @param info The rosbag file info to store (replaces any existing entry for fault_code) virtual void store_rosbag_file(const RosbagFileInfo & info) = 0; @@ -267,6 +294,9 @@ class InMemoryFaultStorage : public FaultStorage { std::vector get_snapshots(const std::string & fault_code, const std::string & topic_filter = "") const override; + void store_freeze_frame(const FreezeFrameData & frame) override; + std::optional get_freeze_frame(const std::string & fault_code) const override; + void store_rosbag_file(const RosbagFileInfo & info) override; std::optional get_rosbag_file(const std::string & fault_code) const override; bool delete_rosbag_file(const std::string & fault_code) override; @@ -283,7 +313,8 @@ class InMemoryFaultStorage : public FaultStorage { mutable std::mutex mutex_; std::map faults_; std::vector snapshots_; - std::map rosbag_files_; ///< fault_code -> rosbag info + std::map freeze_frames_; ///< fault_code -> freeze-frame (retained across clear) + std::map rosbag_files_; ///< fault_code -> rosbag info DebounceConfig config_; size_t max_snapshots_per_fault_{0}; ///< 0 = unlimited }; diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/snapshot_capture.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/snapshot_capture.hpp index 2e375e195..91051f0f6 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/snapshot_capture.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/snapshot_capture.hpp @@ -22,6 +22,8 @@ #include #include +#include + #include "rclcpp/rclcpp.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" @@ -147,6 +149,10 @@ class SnapshotCapture { SnapshotCapture & operator=(SnapshotCapture &&) = delete; /// Capture snapshots for a fault that was just confirmed + /// + /// If the fault code resolves to no capture set (not in fault_specific, no pattern + /// match, no default_topics), capture returns early: no freeze_frames row is written + /// (no empty {} row) and FaultStorage::get_freeze_frame() returns nullopt for it. /// @param fault_code The fault code that was confirmed void capture(const std::string & fault_code); @@ -166,12 +172,16 @@ class SnapshotCapture { std::vector resolve_topics(const std::string & fault_code) const; /// Capture a single topic on-demand (creates temporary subscription) + /// On success also records the captured value into @p freeze_frame under the topic key. /// @return true if capture was successful - bool capture_topic_on_demand(const std::string & fault_code, const std::string & topic); + bool capture_topic_on_demand(const std::string & fault_code, const std::string & topic, + nlohmann::json & freeze_frame); /// Capture a topic from background cache + /// On success also records the cached value into @p freeze_frame under the topic key. /// @return true if data was available in cache - bool capture_topic_from_cache(const std::string & fault_code, const std::string & topic); + bool capture_topic_from_cache(const std::string & fault_code, const std::string & topic, + nlohmann::json & freeze_frame); /// Initialize background subscriptions for all configured topics void init_background_subscriptions(); diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp index 2db560fa3..674e63baf 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp @@ -67,6 +67,9 @@ class SqliteFaultStorage : public FaultStorage { std::vector get_snapshots(const std::string & fault_code, const std::string & topic_filter = "") const override; + void store_freeze_frame(const FreezeFrameData & frame) override; + std::optional get_freeze_frame(const std::string & fault_code) const override; + void store_rosbag_file(const RosbagFileInfo & info) override; std::optional get_rosbag_file(const std::string & fault_code) const override; bool delete_rosbag_file(const std::string & fault_code) override; diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 7053258f1..2ced5abb2 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -988,6 +988,22 @@ void FaultManagerNode::handle_get_fault(const std::shared_ptrenvironment_data.snapshots.push_back(snapshot); } + // Per-topic snapshots are deleted on clear_fault, but the compact freeze-frame row is + // retained. When no snapshots remain, serve the retained frame so the confirmed-state + // record stays observable after acknowledgement. + if (stored_snapshots.empty()) { + auto frame = storage_->get_freeze_frame(request->fault_code); + if (frame) { + ros2_medkit_msgs::msg::Snapshot snapshot; + snapshot.type = ros2_medkit_msgs::msg::Snapshot::TYPE_FREEZE_FRAME; + snapshot.name = "freeze_frame"; + snapshot.data = frame->data; + // topic/message_type left empty: the frame aggregates values from multiple topics + snapshot.captured_at_ns = frame->captured_at_ns; + response->environment_data.snapshots.push_back(snapshot); + } + } + // Get rosbag info if available auto rosbag_info = storage_->get_rosbag_file(request->fault_code); if (rosbag_info) { diff --git a/src/ros2_medkit_fault_manager/src/fault_storage.cpp b/src/ros2_medkit_fault_manager/src/fault_storage.cpp index e89564844..269bb9f93 100644 --- a/src/ros2_medkit_fault_manager/src/fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_storage.cpp @@ -365,6 +365,20 @@ std::vector InMemoryFaultStorage::get_snapshots(const std::string return result; } +void InMemoryFaultStorage::store_freeze_frame(const FreezeFrameData & frame) { + std::lock_guard lock(mutex_); + freeze_frames_[frame.fault_code] = frame; +} + +std::optional InMemoryFaultStorage::get_freeze_frame(const std::string & fault_code) const { + std::lock_guard lock(mutex_); + auto it = freeze_frames_.find(fault_code); + if (it == freeze_frames_.end()) { + return std::nullopt; + } + return it->second; +} + void InMemoryFaultStorage::store_rosbag_file(const RosbagFileInfo & info) { std::lock_guard lock(mutex_); diff --git a/src/ros2_medkit_fault_manager/src/snapshot_capture.cpp b/src/ros2_medkit_fault_manager/src/snapshot_capture.cpp index 625d0c046..14b715107 100644 --- a/src/ros2_medkit_fault_manager/src/snapshot_capture.cpp +++ b/src/ros2_medkit_fault_manager/src/snapshot_capture.cpp @@ -124,6 +124,10 @@ void SnapshotCapture::capture(const std::string & fault_code) { auto topics = resolve_topics(fault_code); if (topics.empty()) { + // Unconfigured fault code: resolve_topics() yielded nothing, so capture returns early. + // No freeze_frames row is written (we never persist an empty {} row) and no + // subscriptions are created. A freeze-frame lookup for such a fault returns not-found + // (get_freeze_frame() == nullopt); absence of a row means "no capture configured". RCLCPP_DEBUG(node_->get_logger(), "No topics configured for fault '%s'", fault_code.c_str()); return; } @@ -131,13 +135,15 @@ void SnapshotCapture::capture(const std::string & fault_code) { RCLCPP_INFO(node_->get_logger(), "Capturing snapshots for fault '%s' (%zu topics)", fault_code.c_str(), topics.size()); + // Accumulate a compact dict of captured topic values alongside the per-topic snapshots. + nlohmann::json freeze_frame = nlohmann::json::object(); size_t captured_count = 0; for (const auto & topic : topics) { bool success = false; if (config_.background_capture) { - success = capture_topic_from_cache(fault_code, topic); + success = capture_topic_from_cache(fault_code, topic, freeze_frame); } else { - success = capture_topic_on_demand(fault_code, topic); + success = capture_topic_on_demand(fault_code, topic, freeze_frame); } if (success) { @@ -147,6 +153,27 @@ void SnapshotCapture::capture(const std::string & fault_code) { RCLCPP_INFO(node_->get_logger(), "Captured %zu/%zu snapshots for fault '%s'", captured_count, topics.size(), fault_code.c_str()); + + // Persist the compact freeze-frame keyed by fault_code (retained across clear_fault). + // Only reached for faults with a configured capture set (unconfigured codes returned + // early above and get no row). If nothing was publishing at confirmation time the row + // holds an empty object, recording that a configured capture ran but sampled nothing. + // An empty capture must never clobber an existing non-empty frame: a re-confirm while + // the source publishers are down would otherwise replace the retained record with {}. + if (captured_count == 0) { + auto existing = storage_->get_freeze_frame(fault_code); + if (existing.has_value() && existing->data != "{}") { + RCLCPP_WARN(node_->get_logger(), "Nothing captured for fault '%s'; keeping previously retained freeze-frame", + fault_code.c_str()); + return; + } + } + + FreezeFrameData frame; + frame.fault_code = fault_code; + frame.data = freeze_frame.dump(); + frame.captured_at_ns = get_wall_clock_ns(); + storage_->store_freeze_frame(frame); } std::vector SnapshotCapture::resolve_topics(const std::string & fault_code) const { @@ -174,7 +201,8 @@ std::vector SnapshotCapture::resolve_topics(const std::string & fau return {}; } -bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, const std::string & topic) { +bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, const std::string & topic, + nlohmann::json & freeze_frame) { // Get topic type std::string msg_type = get_topic_type(topic); if (msg_type.empty()) { @@ -286,6 +314,9 @@ bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, co storage_->store_snapshot(snapshot); + // Record the value into the compact freeze-frame dict under the topic key. + freeze_frame[topic] = std::move(json_data); + RCLCPP_DEBUG(node_->get_logger(), "Captured snapshot from '%s' for fault '%s'", topic.c_str(), fault_code.c_str()); return true; @@ -300,7 +331,8 @@ bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, co return false; } -bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, const std::string & topic) { +bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, const std::string & topic, + nlohmann::json & freeze_frame) { std::lock_guard lock(cache_mutex_); auto it = message_cache_.find(topic); @@ -321,6 +353,15 @@ bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, c storage_->store_snapshot(snapshot); + // Record the cached value into the compact freeze-frame dict. The cache holds a serialized + // JSON string; parse it back so the frame nests structured values (fall back to the raw + // string only if it somehow fails to parse). + try { + freeze_frame[topic] = nlohmann::json::parse(cached.data); + } catch (const nlohmann::json::parse_error &) { + freeze_frame[topic] = cached.data; + } + RCLCPP_DEBUG(node_->get_logger(), "Captured snapshot from cache for '%s' (fault '%s')", topic.c_str(), fault_code.c_str()); return true; diff --git a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp index 76482987b..f6743afed 100644 --- a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp @@ -188,6 +188,23 @@ void SqliteFaultStorage::initialize_schema() { throw std::runtime_error("Failed to create snapshots table: " + error); } + // Create freeze_frames table: one compact JSON dict of captured topic values per fault + // code. Unlike snapshots, freeze frames are keyed by fault_code and are NOT removed on + // clear_fault, so the confirmed-state record is retained after acknowledgement. + const char * create_freeze_frames_table_sql = R"( + CREATE TABLE IF NOT EXISTS freeze_frames ( + fault_code TEXT PRIMARY KEY, + data TEXT NOT NULL, + captured_at_ns INTEGER NOT NULL + ); + )"; + + if (sqlite3_exec(db_, create_freeze_frames_table_sql, nullptr, nullptr, &err_msg) != SQLITE_OK) { + std::string error = err_msg ? err_msg : "Unknown error"; + sqlite3_free(err_msg); + throw std::runtime_error("Failed to create freeze_frames table: " + error); + } + // Create rosbag_files table for storing time-window bag file metadata const char * create_rosbag_files_table_sql = R"( CREATE TABLE IF NOT EXISTS rosbag_files ( @@ -795,6 +812,39 @@ std::vector SqliteFaultStorage::get_snapshots(const std::string & return result; } +void SqliteFaultStorage::store_freeze_frame(const FreezeFrameData & frame) { + std::lock_guard lock(mutex_); + + // Keyed by fault_code (PRIMARY KEY): a re-confirm replaces the previous frame. + SqliteStatement stmt(db_, + "INSERT OR REPLACE INTO freeze_frames (fault_code, data, captured_at_ns) " + "VALUES (?, ?, ?)"); + stmt.bind_text(1, frame.fault_code); + stmt.bind_text(2, frame.data); + stmt.bind_int64(3, frame.captured_at_ns); + + if (stmt.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("Failed to store freeze frame: ") + sqlite3_errmsg(db_)); + } +} + +std::optional SqliteFaultStorage::get_freeze_frame(const std::string & fault_code) const { + std::lock_guard lock(mutex_); + + SqliteStatement stmt(db_, "SELECT fault_code, data, captured_at_ns FROM freeze_frames WHERE fault_code = ?"); + stmt.bind_text(1, fault_code); + + if (stmt.step() != SQLITE_ROW) { + return std::nullopt; + } + + FreezeFrameData frame; + frame.fault_code = stmt.column_text(0); + frame.data = stmt.column_text(1); + frame.captured_at_ns = stmt.column_int64(2); + return frame; +} + void SqliteFaultStorage::store_rosbag_file(const RosbagFileInfo & info) { std::lock_guard lock(mutex_); diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 3eb477b2e..e2c23bc15 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -25,6 +25,9 @@ #include #include +#include +#include + #include "rclcpp/rclcpp.hpp" #include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_manager_node.hpp" @@ -32,6 +35,7 @@ #include "ros2_medkit_fault_manager/sqlite_fault_storage.hpp" #include "ros2_medkit_msgs/msg/fault.hpp" #include "ros2_medkit_msgs/msg/fault_event.hpp" +#include "ros2_medkit_msgs/msg/snapshot.hpp" #include "ros2_medkit_msgs/srv/clear_fault.hpp" #include "ros2_medkit_msgs/srv/get_fault.hpp" #include "ros2_medkit_msgs/srv/list_faults_for_entity.hpp" @@ -918,15 +922,21 @@ class FaultEventPublishingTest : public ::testing::Test { protected: static inline std::atomic test_counter_{0}; + /// Fault manager parameter overrides; subclasses extend to enable capture features. + virtual std::vector fault_manager_overrides() { + return { + rclcpp::Parameter("storage_type", "memory"), + rclcpp::Parameter("confirmation_threshold", -1), // Immediate confirmation + }; + } + void SetUp() override { // Unique namespace per test iteration avoids DDS topic collisions std::string ns = "/test_events_" + std::to_string(test_counter_.fetch_add(1)); // Create fault manager node with immediate confirmation rclcpp::NodeOptions fm_options; - fm_options.parameter_overrides({ - {"storage_type", "memory"}, {"confirmation_threshold", -1}, // Immediate confirmation - }); + fm_options.parameter_overrides(fault_manager_overrides()); fm_options.arguments({"--ros-args", "-r", "__ns:=" + ns}); fault_manager_ = std::make_shared(fm_options); @@ -1321,6 +1331,57 @@ TEST_F(FaultEventPublishingTest, ListFaultsForEntityWithEmptyId) { EXPECT_FALSE(response->error_message.empty()); } +// Freeze-frame retention: after clear_fault deletes the per-topic snapshots, GetFault +// must keep serving the retained freeze_frames row (the only surviving record). +class FreezeFrameRetentionTest : public FaultEventPublishingTest { + protected: + std::vector fault_manager_overrides() override { + auto overrides = FaultEventPublishingTest::fault_manager_overrides(); + overrides.emplace_back("snapshots.enabled", true); + overrides.emplace_back("snapshots.timeout_sec", 5.0); + overrides.emplace_back("snapshots.default_topics", std::vector{"/ff_pressure"}); + return overrides; + } +}; + +TEST_F(FreezeFrameRetentionTest, GetFaultServesRetainedFreezeFrameAfterClear) { + auto pub = test_node_->create_publisher("/ff_pressure", rclcpp::QoS(10)); + auto timer = test_node_->create_wall_timer(std::chrono::milliseconds(20), [&pub]() { + std_msgs::msg::Float64 msg; + msg.data = 91.25; + pub->publish(msg); + }); + + // Wait until the fault manager can discover the publisher, then confirm the fault. + ASSERT_TRUE(spin_until([this]() { + return fault_manager_->count_publishers("/ff_pressure") > 0; + })); + ASSERT_TRUE(call_report_fault("FF_FAULT", Fault::SEVERITY_ERROR, "/test_node")); + + // Capture runs asynchronously on the pool; wait for a non-empty freeze-frame. + ASSERT_TRUE(spin_until( + [this]() { + auto frame = fault_manager_->get_storage().get_freeze_frame("FF_FAULT"); + return frame.has_value() && frame->data != "{}"; + }, + std::chrono::milliseconds(10000))); + + ASSERT_TRUE(call_clear_fault("FF_FAULT")); + + auto response = call_get_fault("FF_FAULT"); + ASSERT_TRUE(response.has_value()); + ASSERT_TRUE(response->success); + + // Per-topic snapshots were deleted on clear; the retained frame is served instead. + ASSERT_EQ(response->environment_data.snapshots.size(), 1u); + const auto & snapshot = response->environment_data.snapshots[0]; + EXPECT_EQ(snapshot.type, ros2_medkit_msgs::msg::Snapshot::TYPE_FREEZE_FRAME); + EXPECT_EQ(snapshot.name, "freeze_frame"); + auto parsed = nlohmann::json::parse(snapshot.data); + ASSERT_TRUE(parsed.contains("/ff_pressure")); + EXPECT_DOUBLE_EQ(parsed["/ff_pressure"]["data"].get(), 91.25); +} + // matches_entity helper tests TEST(MatchesEntityTest, ExactMatch) { std::vector sources = {"motor_controller"}; @@ -1397,6 +1458,76 @@ TEST(InMemorySnapshotLimitTest, UnlimitedWhenZero) { EXPECT_EQ(result.size(), 20u); } +// --- InMemoryFaultStorage freeze-frame tests --- + +TEST(InMemoryFreezeFrameTest, StoreAndRetrieve) { + InMemoryFaultStorage storage; + + ros2_medkit_fault_manager::FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4}})"; + frame.captured_at_ns = 1000; + storage.store_freeze_frame(frame); + + auto retrieved = storage.get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, frame.data); + EXPECT_EQ(retrieved->captured_at_ns, 1000); +} + +TEST(InMemoryFreezeFrameTest, AbsentForUnknownFault) { + InMemoryFaultStorage storage; + EXPECT_FALSE(storage.get_freeze_frame("NEVER_CAPTURED").has_value()); +} + +TEST(InMemoryFreezeFrameTest, ReplacedOnRecapture) { + InMemoryFaultStorage storage; + + ros2_medkit_fault_manager::FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4}})"; + frame.captured_at_ns = 1000; + storage.store_freeze_frame(frame); + + frame.data = R"({"/plc/pressure":{"data":9.9}})"; + frame.captured_at_ns = 2000; + storage.store_freeze_frame(frame); + + auto retrieved = storage.get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, R"({"/plc/pressure":{"data":9.9}})"); + EXPECT_EQ(retrieved->captured_at_ns, 2000); +} + +TEST(InMemoryFreezeFrameTest, SurvivesClearFault) { + InMemoryFaultStorage storage; + rclcpp::Clock clock; + + storage.report_fault_event("PLC_PRESSURE_HIGH", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Pressure high", "/plc_node", clock.now(), DebounceConfig{}); + + ros2_medkit_fault_manager::SnapshotData snap; + snap.fault_code = "PLC_PRESSURE_HIGH"; + snap.topic = "/plc/pressure"; + snap.message_type = "std_msgs/msg/Float64"; + snap.data = R"({"data":8.4})"; + snap.captured_at_ns = 1000; + storage.store_snapshot(snap); + + ros2_medkit_fault_manager::FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4}})"; + frame.captured_at_ns = 1000; + storage.store_freeze_frame(frame); + + ASSERT_TRUE(storage.clear_fault("PLC_PRESSURE_HIGH")); + + EXPECT_TRUE(storage.get_snapshots("PLC_PRESSURE_HIGH").empty()); + auto retrieved = storage.get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, frame.data); +} + // ============================================================================= // Snapshot recapture cooldown test // ============================================================================= diff --git a/src/ros2_medkit_fault_manager/test/test_snapshot_capture.cpp b/src/ros2_medkit_fault_manager/test/test_snapshot_capture.cpp index b8eaac3d9..0cfc13b2a 100644 --- a/src/ros2_medkit_fault_manager/test/test_snapshot_capture.cpp +++ b/src/ros2_medkit_fault_manager/test/test_snapshot_capture.cpp @@ -14,10 +14,17 @@ #include +#include +#include +#include #include #include +#include #include +#include +#include + #include "rclcpp/rclcpp.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" #include "ros2_medkit_fault_manager/snapshot_capture.hpp" @@ -223,6 +230,202 @@ TEST_F(SnapshotCaptureTest, OnDemandCaptureHandlesNonExistentTopic) { // Should timeout gracefully, no snapshot stored auto snapshots = storage_->get_snapshots("TEST_FAULT"); EXPECT_TRUE(snapshots.empty()); + + // A configured capture that sampled nothing still records an empty {} frame. + auto frame = storage_->get_freeze_frame("TEST_FAULT"); + ASSERT_TRUE(frame.has_value()); + EXPECT_EQ(frame->data, "{}"); +} + +// Freeze-frame end-to-end tests + +// Stops and joins a publisher thread on scope exit, so an assertion failure +// mid-test never destroys a joinable std::thread (which would std::terminate). +struct ScopedPublisherThread { + std::atomic stop{false}; + std::thread thread; + + explicit ScopedPublisherThread(std::function &)> body) + : thread([this, body = std::move(body)]() { + body(stop); + }) { + } + + ~ScopedPublisherThread() { + stop.store(true); + if (thread.joinable()) { + thread.join(); + } + } +}; + +// @verifies REQ_INTEROP_088 +TEST_F(SnapshotCaptureTest, CaptureWritesFreezeFrameFromConfiguredTopic) { + auto pub = node_->create_publisher("/plc/pressure", rclcpp::QoS(10)); + + SnapshotConfig config; + config.enabled = true; + config.background_capture = false; + config.timeout_sec = 5.0; + config.fault_specific["PLC_PRESSURE_HIGH"] = {"/plc/pressure"}; + SnapshotCapture capture(node_.get(), storage_.get(), config); + + // Publish continuously so the on-demand one-shot subscription catches a value. + ScopedPublisherThread pub_thread([&pub](std::atomic & stop) { + while (!stop.load()) { + std_msgs::msg::Float64 msg; + msg.data = 42.5; + pub->publish(msg); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + }); + + // Wait for the publisher to be visible to the capture node before capturing. + auto start = std::chrono::steady_clock::now(); + while (node_->count_publishers("/plc/pressure") == 0 && + std::chrono::steady_clock::now() - start < std::chrono::seconds(5)) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + ASSERT_GT(node_->count_publishers("/plc/pressure"), 0u); + + capture.capture("PLC_PRESSURE_HIGH"); + + auto frame = storage_->get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(frame.has_value()); + auto parsed = nlohmann::json::parse(frame->data); + ASSERT_TRUE(parsed.contains("/plc/pressure")); + EXPECT_DOUBLE_EQ(parsed["/plc/pressure"]["data"].get(), 42.5); +} + +// Regression for the empty-recapture overwrite: a re-confirm while the source +// publishers are down must not clobber a previously retained non-empty frame. +// @verifies REQ_INTEROP_088 +TEST_F(SnapshotCaptureTest, EmptyRecaptureKeepsRetainedFreezeFrame) { + auto pub = node_->create_publisher("/plc/flow", rclcpp::QoS(10)); + + SnapshotConfig config; + config.enabled = true; + config.background_capture = false; + config.timeout_sec = 5.0; + config.fault_specific["PLC_FLOW_LOW"] = {"/plc/flow"}; + SnapshotCapture capture(node_.get(), storage_.get(), config); + + { + ScopedPublisherThread pub_thread([&pub](std::atomic & stop) { + while (!stop.load()) { + std_msgs::msg::Float64 msg; + msg.data = 7.25; + pub->publish(msg); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + }); + + auto start = std::chrono::steady_clock::now(); + while (node_->count_publishers("/plc/flow") == 0 && + std::chrono::steady_clock::now() - start < std::chrono::seconds(5)) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + ASSERT_GT(node_->count_publishers("/plc/flow"), 0u); + + capture.capture("PLC_FLOW_LOW"); + } + + auto frame = storage_->get_freeze_frame("PLC_FLOW_LOW"); + ASSERT_TRUE(frame.has_value()); + ASSERT_NE(frame->data, "{}"); + + // Take the publisher down and wait until the graph reflects it, then re-capture. + pub.reset(); + auto start = std::chrono::steady_clock::now(); + while (node_->count_publishers("/plc/flow") > 0 && + std::chrono::steady_clock::now() - start < std::chrono::seconds(5)) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + ASSERT_EQ(node_->count_publishers("/plc/flow"), 0u); + + capture.capture("PLC_FLOW_LOW"); + + auto retained = storage_->get_freeze_frame("PLC_FLOW_LOW"); + ASSERT_TRUE(retained.has_value()); + auto parsed = nlohmann::json::parse(retained->data); + ASSERT_TRUE(parsed.contains("/plc/flow")); + EXPECT_DOUBLE_EQ(parsed["/plc/flow"]["data"].get(), 7.25); +} + +// Exercises the background-capture cache path into the freeze-frame (cached JSON +// string parsed back into a structured value). +// @verifies REQ_INTEROP_088 +TEST_F(SnapshotCaptureTest, BackgroundCaptureCachesFreezeFrame) { + auto pub = node_->create_publisher("/plc/temperature", rclcpp::QoS(10)); + + ScopedPublisherThread pub_thread([&pub](std::atomic & stop) { + while (!stop.load()) { + std_msgs::msg::Float64 msg; + msg.data = 91.5; + pub->publish(msg); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + }); + + // The publisher already exists on this node, so the topic type is resolvable + // when the constructor sets up the background subscription. + SnapshotConfig config; + config.enabled = true; + config.background_capture = true; + config.fault_specific["PLC_TEMP_HIGH"] = {"/plc/temperature"}; + SnapshotCapture capture(node_.get(), storage_.get(), config); + + // Spin so the background subscription caches a message, then capture from cache. + // Until the cache fills, capture records an empty first-run {} frame; retry. + bool got_frame = false; + auto start = std::chrono::steady_clock::now(); + while (!got_frame && std::chrono::steady_clock::now() - start < std::chrono::seconds(10)) { + rclcpp::spin_some(node_); + capture.capture("PLC_TEMP_HIGH"); + auto frame = storage_->get_freeze_frame("PLC_TEMP_HIGH"); + got_frame = frame.has_value() && frame->data != "{}"; + if (!got_frame) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + } + ASSERT_TRUE(got_frame); + + auto frame = storage_->get_freeze_frame("PLC_TEMP_HIGH"); + ASSERT_TRUE(frame.has_value()); + auto parsed = nlohmann::json::parse(frame->data); + ASSERT_TRUE(parsed.contains("/plc/temperature")); + EXPECT_DOUBLE_EQ(parsed["/plc/temperature"]["data"].get(), 91.5); + + // The cache path also stores a per-topic snapshot. + auto snapshots = storage_->get_snapshots("PLC_TEMP_HIGH"); + ASSERT_FALSE(snapshots.empty()); + EXPECT_EQ(snapshots.back().topic, "/plc/temperature"); + EXPECT_EQ(snapshots.back().message_type, "std_msgs/msg/Float64"); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SnapshotCaptureTest, UnconfiguredFaultWritesNoFreezeFrame) { + SnapshotConfig config; + config.enabled = true; + config.fault_specific["OTHER_FAULT"] = {"/plc/pressure"}; + // No default_topics: an unrelated fault code resolves to an empty capture set. + SnapshotCapture capture(node_.get(), storage_.get(), config); + + capture.capture("UNMAPPED_FAULT"); + + EXPECT_FALSE(storage_->get_freeze_frame("UNMAPPED_FAULT").has_value()); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SnapshotCaptureTest, DisabledCaptureWritesNoFreezeFrame) { + SnapshotConfig config; + config.enabled = false; + config.default_topics = {"/plc/pressure"}; + SnapshotCapture capture(node_.get(), storage_.get(), config); + + capture.capture("ANY_FAULT"); + + EXPECT_FALSE(storage_->get_freeze_frame("ANY_FAULT").has_value()); } int main(int argc, char ** argv) { diff --git a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp index 736a9dd5d..586c3c7d1 100644 --- a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp +++ b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp @@ -879,6 +879,110 @@ TEST_F(SqliteFaultStorageTest, ClearFaultDeletesAssociatedSnapshots) { EXPECT_TRUE(snapshots_after.empty()); } +// Freeze-frame storage tests +// @verifies REQ_INTEROP_088 +TEST_F(SqliteFaultStorageTest, StoreAndRetrieveFreezeFrame) { + using ros2_medkit_fault_manager::FreezeFrameData; + rclcpp::Clock clock; + + storage_->report_fault_event("PLC_PRESSURE_HIGH", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Pressure high", "/plc_node", clock.now(), default_config()); + + FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4},"/plc/valve":{"data":true}})"; + frame.captured_at_ns = clock.now().nanoseconds(); + storage_->store_freeze_frame(frame); + + auto retrieved = storage_->get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->fault_code, "PLC_PRESSURE_HIGH"); + EXPECT_EQ(retrieved->data, frame.data); + EXPECT_EQ(retrieved->captured_at_ns, frame.captured_at_ns); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SqliteFaultStorageTest, NoFreezeFrameForUnknownFault) { + auto retrieved = storage_->get_freeze_frame("NEVER_CAPTURED"); + EXPECT_FALSE(retrieved.has_value()); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SqliteFaultStorageTest, FreezeFrameReplacedOnRecapture) { + using ros2_medkit_fault_manager::FreezeFrameData; + rclcpp::Clock clock; + + FreezeFrameData first; + first.fault_code = "PLC_PRESSURE_HIGH"; + first.data = R"({"/plc/pressure":{"data":8.4}})"; + first.captured_at_ns = 1000; + storage_->store_freeze_frame(first); + + FreezeFrameData second; + second.fault_code = "PLC_PRESSURE_HIGH"; + second.data = R"({"/plc/pressure":{"data":9.9}})"; + second.captured_at_ns = 2000; + storage_->store_freeze_frame(second); + + auto retrieved = storage_->get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, second.data); + EXPECT_EQ(retrieved->captured_at_ns, 2000); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SqliteFaultStorageTest, FreezeFrameSurvivesClearFault) { + using ros2_medkit_fault_manager::FreezeFrameData; + using ros2_medkit_fault_manager::SnapshotData; + rclcpp::Clock clock; + + storage_->report_fault_event("PLC_PRESSURE_HIGH", ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, + "Pressure high", "/plc_node", clock.now(), default_config()); + + // A per-topic snapshot (removed on clear) plus a freeze-frame (retained on clear). + SnapshotData snapshot; + snapshot.fault_code = "PLC_PRESSURE_HIGH"; + snapshot.topic = "/plc/pressure"; + snapshot.message_type = "std_msgs/msg/Float64"; + snapshot.data = R"({"data":8.4})"; + snapshot.captured_at_ns = clock.now().nanoseconds(); + storage_->store_snapshot(snapshot); + + FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4}})"; + frame.captured_at_ns = clock.now().nanoseconds(); + storage_->store_freeze_frame(frame); + + ASSERT_TRUE(storage_->clear_fault("PLC_PRESSURE_HIGH")); + + // Snapshots are wiped on clear, the freeze-frame is retained and still retrievable. + EXPECT_TRUE(storage_->get_snapshots("PLC_PRESSURE_HIGH").empty()); + auto retrieved = storage_->get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, frame.data); +} + +// @verifies REQ_INTEROP_088 +TEST_F(SqliteFaultStorageTest, FreezeFramePersistsAcrossReopen) { + using ros2_medkit_fault_manager::FreezeFrameData; + + FreezeFrameData frame; + frame.fault_code = "PLC_PRESSURE_HIGH"; + frame.data = R"({"/plc/pressure":{"data":8.4}})"; + frame.captured_at_ns = 4242; + storage_->store_freeze_frame(frame); + + // Reopen the same database file. + storage_.reset(); + storage_ = std::make_unique(temp_db_path_.string()); + + auto retrieved = storage_->get_freeze_frame("PLC_PRESSURE_HIGH"); + ASSERT_TRUE(retrieved.has_value()); + EXPECT_EQ(retrieved->data, frame.data); + EXPECT_EQ(retrieved->captured_at_ns, 4242); +} + // Rosbag entity-scoped listing tests // @verifies REQ_INTEROP_071