From 20ede1cf19476817b5e1d6111d7fdd8e960c3a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 20:43:11 +0200 Subject: [PATCH 01/29] docs(testing): seed issue-135 writeup skeleton Adds docs/testing/issue-135.md with the per-phase writeup index for the MqttRelay availability topic fix and seeds the docs/testing/issue-135/ subfolder with the phase-00 foundation writeup. --- docs/testing/issue-135.md | 50 +++++++++++++++++++ docs/testing/issue-135/phase-00-foundation.md | 43 ++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 docs/testing/issue-135.md create mode 100644 docs/testing/issue-135/phase-00-foundation.md diff --git a/docs/testing/issue-135.md b/docs/testing/issue-135.md new file mode 100644 index 000000000..44d919645 --- /dev/null +++ b/docs/testing/issue-135.md @@ -0,0 +1,50 @@ +# Issue #135 — MqttRelay availability topic emission + +Tracking writeup for the fix to [TrakHound/MTConnect.NET#135](https://github.com/TrakHound/MTConnect.NET/issues/135). + +## 1. Defect and scope + +The `MTConnect.NET-AgentModule-MqttRelay` agent module publishes the +Agent Availability state on `{TopicPrefix}/Probe/{AgentUuid}/Available` +as both the MQTT Last Will and Testament (LWT) and as a retained +`AVAILABLE` / `UNAVAILABLE` UTF-8 string. This breaks the contract +that every topic under `{TopicPrefix}/Probe/#` carries a JSON document +envelope: a subscriber wildcarding on `Probe/#` and parsing every +payload as JSON throws on the raw availability string. + +Scope of this writeup is the `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/` +project only. No other module, library, or test project is in scope. + +## 2. Investigation + +See `docs/testing/issue-135/phase-01-defect-scoping.md`. + +## 3. Red tests + +See `docs/testing/issue-135/phase-02-red-tests.md`. + +## 4. Library fix + +See `docs/testing/issue-135/phase-03-library-fix.md`. + +## 5. 
Regression pins + +See `docs/testing/issue-135/phase-04-regression-pins.md`. + +## 6. End-to-end validation + +See `docs/testing/issue-135/phase-05-e2e-validation.md`. + +## 7. Summary + +See `docs/testing/issue-135/phase-06-summary.md`. + +## Operator migration + +After this fix, operators that previously subscribed to +`{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw-string +availability state must move their subscription to +`{TopicPrefix}/Agent/{AgentUuid}/Available`. Subscribers of the +`{TopicPrefix}/Probe/#` wildcard will no longer receive the +availability message and can rely on the wildcard delivering only +JSON document envelopes (Probe). diff --git a/docs/testing/issue-135/phase-00-foundation.md b/docs/testing/issue-135/phase-00-foundation.md new file mode 100644 index 000000000..ae05e7fa8 --- /dev/null +++ b/docs/testing/issue-135/phase-00-foundation.md @@ -0,0 +1,43 @@ +# Phase 00 — foundation + +## Executed + +- Cut branch `fix/issue-135` from `upstream/master` at `3d6321ab Updated ReadMe`. +- Seeded `docs/testing/issue-135.md` with the writeup index and + per-phase placeholders. +- Created `docs/testing/issue-135/` for per-phase writeups. + +## Bootstrap dependency + +The plan author intended to consume bootstrap outputs (shared +`tests/coverlet.runsettings`, `tools/test.sh`, scaffolded test +projects) that have not yet landed on `upstream/master`. This branch +proceeds without those outputs: + +- Validation runs `dotnet build` and `dotnet test` directly against + the touched projects rather than `tools/test.sh`. +- Coverage is measured locally with `coverlet.collector` (already + present in every existing test project's csproj) and + `dotnet test --collect:"XPlat Code Coverage"`. +- The new `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` project + is created as part of this branch (P2) rather than consumed from + bootstrap. + +## Metrics delta + +None at this phase (documentation only). 
+ +## Deviations from plan + +- The plan's `01-foundation.md` cites a `docs(issue-135): seed + writeup skeleton` commit subject. `issue-135` is not a recognised + scope under CONVENTIONS.md §5.3; the correct documentation scope + for `docs/testing//` is `testing`. This phase commits as + `docs(testing): seed issue-135 writeup skeleton`. +- The plan assumed bootstrap had landed. Since it has not, the + branch creates the paired test project itself in P2 instead of + consuming a bootstrap-scaffolded stub. + +## Follow-ups + +- None. From 644b2e59fb4fa9e899c9a90855169f5059de598d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 20:44:23 +0200 Subject: [PATCH 02/29] docs(testing): scope mqtt-relay availability topic defect Records the publish-surface investigation, the cppagent reference posture, and the strategy decision (relocate out of the Probe wildcard) for the issue-135 fix. --- .../issue-135/phase-01-defect-scoping.md | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 docs/testing/issue-135/phase-01-defect-scoping.md diff --git a/docs/testing/issue-135/phase-01-defect-scoping.md b/docs/testing/issue-135/phase-01-defect-scoping.md new file mode 100644 index 000000000..bc413b2d3 --- /dev/null +++ b/docs/testing/issue-135/phase-01-defect-scoping.md @@ -0,0 +1,94 @@ +# Phase 01 — defect scoping + +## Publish surface + +`agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs` +contains all topic-construction sites that produce the broken +availability publishes: + +- Line 738-746 — `GetAgentAvailableTopic()` returns + `{TopicPrefix}/{ProbeTopic}/{Agent.Uuid}/Available` where + `ProbeTopic` resolves to the constant `Probe`. Result: + `{TopicPrefix}/Probe/{AgentUuid}/Available`. +- Line 119-123 — `clientOptionsBuilder.WithWillTopic(GetAgentAvailableTopic())` + configures the MQTT Last Will and Testament. Payload is + `Availability.UNAVAILABLE.ToString()` (raw UTF-8 bytes), QoS 1, + retained. 
+- Line 417-449 — when the `Agent` device's Probe document is + published, the same `GetAgentAvailableTopic()` is republished as + a retained message with payload `Availability.AVAILABLE.ToString()` + (raw UTF-8 bytes), QoS 1, retained. + +`MqttTopicStructure.cs` declares topic prefix constants but does +not contribute additional topic-construction sites. + +## cppagent reference + +The reference `mtconnect/cppagent` implementation does not publish a +dedicated raw-string availability topic under the Probe wildcard. +Agent availability is communicated via: + +- The MQTT Last Will and Testament fires a transport-level + notification on a topic outside the document wildcards (typical + layout: a status / agent topic, never under `Probe/#`). +- The normal observation flow encodes the `AVAILABILITY` DataItem + inside the JSON `Current` envelope when the agent is reachable. + +A subscriber wildcarding on `{TopicPrefix}/Probe/#` consequently +receives only JSON document envelopes from the cppagent reference. + +## Strategy decision + +**Strategy A — relocate the topic out of the Probe wildcard.** + +The fix relocates both the LWT and the retained message from +`{TopicPrefix}/Probe/{AgentUuid}/Available` to +`{TopicPrefix}/Agent/{AgentUuid}/Available`. The `Probe/#` wildcard +then carries only JSON document envelopes, matching the cppagent +contract. + +Strategy A was selected over the alternatives because: + +- Strategy B (JSON-wrap the payload in place) keeps the four-segment + topic shape under `Probe/#` which still violates the cppagent + three-segment Probe contract. Partial compliance only. +- Strategy C (eliminate the dedicated publish, rely on the + `AVAILABILITY` DataItem in the Current envelope) loses the MQTT + transport-level Last Will signal that operators rely on for fast + detection of unclean disconnects. 
+ +## Breaking-change impact + +Strategy A is a breaking change for any operator that subscribes +directly to `{TopicPrefix}/Probe/{AgentUuid}/Available`. The fix +ships with an operator migration note in `docs/testing/issue-135.md` +and a `Breaking:` bullet in the PR body. + +A repo-wide grep for the legacy topic shape finds no in-repo +consumers (no examples, no fixtures, no documentation) — the +breaking surface is external to this repository. + +## Sources + +- `https://github.com/TrakHound/MTConnect.NET/issues/135` — defect + report describing the failed `Probe/#` JSON parse. +- `https://github.com/mtconnect/cppagent` — reference implementation + for the canonical MQTT topic layout under `Probe/#`. + +## Metrics delta + +None at this phase (documentation only). + +## Deviations from plan + +- The plan's `02-defect-scoping.md` cites a `docs(issue-135): scope + mqtt-relay availability topic defect` commit subject. `issue-135` + is not a recognised scope under CONVENTIONS.md §5.3; the correct + documentation scope is `testing` (per §5.4 rule 5: docs about a + feature use the feature's scope, but the writeups under + `docs/testing//` are plan-tracking artefacts and use the + `testing` documentation scope). + +## Follow-ups + +- None. From 79517808c76aabced5a24376830720516145f920 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 20:52:05 +0200 Subject: [PATCH 03/29] refactor(agent-module): extract availability topic helper in MqttRelay Extracts the agent availability topic-construction logic in the MqttRelay module to a new public static helper class AvailabilityTopic.Build(topicPrefix, agentUuid). Module.cs's GetAgentAvailableTopic delegates to the helper. No behavior change: the helper still emits the existing {TopicPrefix}/Probe/{AgentUuid}/Available shape so the broken topic contract reported in TrakHound/MTConnect.NET#135 is preserved for the incoming red-test commit. 
The subsequent fix flips the helper's output to the corrected shape. --- .../AvailabilityTopic.cs | 40 +++++++++++++++++++ .../Module.cs | 7 +--- 2 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs new file mode 100644 index 000000000..496863d85 --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs @@ -0,0 +1,40 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +namespace MTConnect +{ + /// + /// Builds the MQTT topic on which the MqttRelay agent module + /// publishes the agent's Availability state (the MQTT Last Will and + /// Testament topic plus the on-connect retained Available message). + /// + public static class AvailabilityTopic + { + /// + /// Constant trailing topic segment that names the availability + /// publish. + /// + public const string AvailableSegment = "Available"; + + /// + /// Builds the full MQTT topic the MqttRelay module uses to + /// publish the agent's Availability state. Returns + /// null when either input is null or empty so callers + /// can short-circuit before configuring an MQTT publish. + /// + /// + /// The configured TopicPrefix value, e.g. + /// MTConnect or MTConnect/Document. + /// + /// + /// The agent's Uuid identifier. 
+ /// + public static string Build(string topicPrefix, string agentUuid) + { + if (string.IsNullOrEmpty(topicPrefix)) return null; + if (string.IsNullOrEmpty(agentUuid)) return null; + + return $"{topicPrefix}/{MTConnectMqttDocumentServer.ProbeTopic}/{agentUuid}/{AvailableSegment}"; + } + } +} diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 06411a474..fdd2add5b 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -737,12 +737,9 @@ private async void AgentAssetAdded(object sender, IAsset asset) private string GetAgentAvailableTopic() { - if (Agent != null && _configuration != null) - { - return $"{_configuration.TopicPrefix}/{MTConnectMqttDocumentServer.ProbeTopic}/{Agent.Uuid}/Available"; ; - } + if (Agent == null || _configuration == null) return null; - return null; + return AvailabilityTopic.Build(_configuration.TopicPrefix, Agent.Uuid); } } } From 0769b255cd1b5947301e8d0ca38b638c231755da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 20:59:33 +0200 Subject: [PATCH 04/29] test(agent-module): add red tests for MqttRelay availability topic Adds tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ NUnit test project covering the agent module's availability topic-construction helper. The fixture pins the corrected topic shape that resolves TrakHound/MTConnect.NET#135: {TopicPrefix}/Agent/{AgentUuid}/Available rather than the broken {TopicPrefix}/Probe/{AgentUuid}/Available shape under the Probe wildcard. The fixture also covers null and empty inputs plus the public AvailableSegment constant so coverage of AvailabilityTopic.cs is 100 percent once the fix flips the Probe segment to Agent. Three of the eight tests are red against the prior commit (the extracted helper still emits the broken topic shape). The fix in the following commit makes them green. 
Registers the new project in MTConnect.NET.sln so the solution-level build picks it up. --- .../AvailabilityTopicTests.cs | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs new file mode 100644 index 000000000..a2df94669 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs @@ -0,0 +1,105 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MQTT topic on which the MqttRelay agent module publishes + /// the agent's Availability state. Background and motivation: + /// + /// https://github.com/TrakHound/MTConnect.NET/issues/135 + /// + /// The MqttRelay's MQTT Last Will and Testament plus on-connect + /// retained Available message previously emitted on + /// {TopicPrefix}/Probe/{AgentUuid}/Available + /// with a raw "AVAILABLE" / "UNAVAILABLE" UTF-8 string payload. + /// That four-segment topic falls under the + /// {TopicPrefix}/Probe/# + /// wildcard, breaking the contract that every payload under that + /// wildcard is a JSON document envelope. The cppagent reference + /// implementation + /// https://github.com/mtconnect/cppagent + /// publishes only JSON document envelopes under the Probe wildcard + /// and emits the agent availability state outside the wildcard + /// (typically on a dedicated agent / status topic). + /// + /// These tests pin the corrected shape: + /// {TopicPrefix}/Agent/{AgentUuid}/Available + /// so the Probe wildcard remains pure-JSON for any subscriber. 
+ /// + [TestFixture] + public class AvailabilityTopicTests + { + [Test] + public void Build_returns_topic_outside_probe_wildcard() + { + // Arrange / Act + var topic = AvailabilityTopic.Build("MTConnect", "agent-uuid-1"); + + // Assert: never emit under the Probe/# wildcard. + Assert.That(topic, Does.Not.Contain("/Probe/"), + "MqttRelay availability topic must not fall under the Probe/# wildcard."); + } + + [Test] + public void Build_uses_dedicated_agent_segment() + { + // Arrange + const string topicPrefix = "MTConnect"; + const string agentUuid = "agent-uuid-1"; + + // Act + var topic = AvailabilityTopic.Build(topicPrefix, agentUuid); + + // Assert: shape matches the relocated contract. + Assert.That(topic, Is.EqualTo("MTConnect/Agent/agent-uuid-1/Available")); + } + + [Test] + public void Build_preserves_multi_segment_topic_prefix() + { + // Arrange + const string topicPrefix = "MTConnect/Document"; + const string agentUuid = "agent-uuid-2"; + + // Act + var topic = AvailabilityTopic.Build(topicPrefix, agentUuid); + + // Assert + Assert.That(topic, Is.EqualTo("MTConnect/Document/Agent/agent-uuid-2/Available")); + Assert.That(topic, Does.Not.Contain("/Probe/")); + } + + [Test] + public void Build_returns_null_when_topic_prefix_is_null() + { + Assert.That(AvailabilityTopic.Build(null, "agent-uuid-1"), Is.Null); + } + + [Test] + public void Build_returns_null_when_topic_prefix_is_empty() + { + Assert.That(AvailabilityTopic.Build(string.Empty, "agent-uuid-1"), Is.Null); + } + + [Test] + public void Build_returns_null_when_agent_uuid_is_null() + { + Assert.That(AvailabilityTopic.Build("MTConnect", null), Is.Null); + } + + [Test] + public void Build_returns_null_when_agent_uuid_is_empty() + { + Assert.That(AvailabilityTopic.Build("MTConnect", string.Empty), Is.Null); + } + + [Test] + public void AvailableSegment_constant_pins_trailing_topic_segment() + { + Assert.That(AvailabilityTopic.AvailableSegment, Is.EqualTo("Available")); + } + } +} From 
308829b43145ad7756e1b67cc2012d4de7949995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 21:02:56 +0200 Subject: [PATCH 05/29] fix(agent-module): correct availability topic emission in MqttRelay Relocates the MqttRelay agent module's MQTT Last Will and Testament topic plus its on-connect retained Available message out of the {TopicPrefix}/Probe/# wildcard. AvailabilityTopic.Build now emits {TopicPrefix}/Agent/{AgentUuid}/Available instead of {TopicPrefix}/Probe/{AgentUuid}/Available. The Probe wildcard therefore carries only JSON document envelopes, restoring the contract a subscriber wildcarding Probe/# relies on (the prior shape crashed JSON parsers on the raw "AVAILABLE" / "UNAVAILABLE" UTF-8 string payload). Closes TrakHound/MTConnect.NET#135 BREAKING CHANGE: operators that subscribed directly to {TopicPrefix}/Probe/{AgentUuid}/Available for the raw availability state must move their subscription to {TopicPrefix}/Agent/{AgentUuid}/Available. --- .../AvailabilityTopic.cs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs index 496863d85..3a7934eea 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs @@ -10,6 +10,15 @@ namespace MTConnect /// public static class AvailabilityTopic { + /// + /// Constant topic segment that separates the agent-availability + /// publishes from the document-envelope publishes (Probe / + /// Current / Sample / Asset). Subscribers wildcarding on + /// {TopicPrefix}/Probe/# therefore never receive the raw + /// availability payload. + /// + public const string AgentSegment = "Agent"; + /// /// Constant trailing topic segment that names the availability /// publish. 
@@ -34,7 +43,7 @@ public static string Build(string topicPrefix, string agentUuid) if (string.IsNullOrEmpty(topicPrefix)) return null; if (string.IsNullOrEmpty(agentUuid)) return null; - return $"{topicPrefix}/{MTConnectMqttDocumentServer.ProbeTopic}/{agentUuid}/{AvailableSegment}"; + return $"{topicPrefix}/{AgentSegment}/{agentUuid}/{AvailableSegment}"; } } } From a7d1677f467d0b6b677c4cf822446e64cfd26b89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 21:05:10 +0200 Subject: [PATCH 06/29] test(agent-module): pin MqttRelay availability topic regression Adds a regression fixture that guards against re-introduction of the broken availability topic shape resolved in TrakHound/MTConnect.NET#135. The fixture covers a parametric matrix of topic prefixes and agent uuids (including adversarial inputs where the prefix or the uuid is literally the string "Probe") and asserts that the dedicated availability segment stays the AvailabilityTopic.AgentSegment constant rather than collapsing back to the Probe topic constant. --- .../AvailabilityTopicRegressionTests.cs | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs new file mode 100644 index 000000000..9099d32e0 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs @@ -0,0 +1,88 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the regression for + /// https://github.com/TrakHound/MTConnect.NET/issues/135. 
The + /// MqttRelay agent module previously published its agent + /// availability state (LWT plus on-connect retained Available + /// message) on a four-segment topic under the Probe wildcard with a + /// raw UTF-8 string payload. Any subscriber on + /// {TopicPrefix}/Probe/# parsing every payload as JSON failed on + /// that raw string. + /// + /// These tests guard against accidental re-introduction of the + /// broken shape: any future refactor that re-routes the + /// availability publish under the Probe wildcard or that re-uses + /// the Probe topic constant in availability-topic construction + /// fails the guard. + /// + /// Source references: + /// https://github.com/TrakHound/MTConnect.NET/issues/135 + /// https://github.com/mtconnect/cppagent (canonical Probe/# + /// wildcard contract: only JSON document envelopes). + /// + [TestFixture] + public class AvailabilityTopicRegressionTests + { + [Test] + public void Topic_never_contains_probe_segment_for_any_inputs() + { + string[] topicPrefixes = + { + "MTConnect", + "MTConnect/Document", + "fleet/site/agent-1", + "Probe", // adversarial: prefix happens to be "Probe". + "x" + }; + + string[] agentUuids = + { + "agent-uuid-1", + "Probe", // adversarial: uuid happens to be "Probe". + "00000000-0000-0000-0000-000000000000", + "agent.with.dots", + "agent-with-dashes" + }; + + foreach (var prefix in topicPrefixes) + { + foreach (var uuid in agentUuids) + { + var topic = AvailabilityTopic.Build(prefix, uuid); + + Assert.That(topic, Is.Not.Null, + $"Build({prefix}, {uuid}) should produce a topic."); + + // The dedicated availability segment is "Agent", + // sandwiched between the configured prefix and + // the agent uuid. Even when the prefix or the + // uuid happens to be the literal "Probe" string, + // the dedicated availability segment must never + // be "Probe" so a Probe/# wildcard subscriber + // never matches the availability publish. 
+ var expectedSegment = + $"/{AvailabilityTopic.AgentSegment}/{uuid}/{AvailabilityTopic.AvailableSegment}"; + + Assert.That(topic, Does.EndWith(expectedSegment), + $"Build({prefix}, {uuid}) should end with the relocated availability segment."); + } + } + } + + [Test] + public void Agent_segment_is_distinct_from_probe_constant() + { + // Guards against a future refactor that points + // AvailabilityTopic.AgentSegment back at the Probe topic + // constant (the source of the original defect). + Assert.That(AvailabilityTopic.AgentSegment, Is.Not.EqualTo("Probe")); + Assert.That(AvailabilityTopic.AgentSegment, Is.EqualTo("Agent")); + } + } +} From 6b5abba3ce1cf9efd959a41caca3b1346f80e656 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 21:10:07 +0200 Subject: [PATCH 07/29] docs(testing): document issue-135 phases 02 through 06 Adds per-phase writeups for the red tests (phase 02), library fix (phase 03), regression pins (phase 04), end-to-end validation posture (phase 05), and the closing summary (phase 06) of the MqttRelay availability topic fix. Each writeup records executed work, metrics deltas, deviations from the original plan, and follow-ups. 
--- docs/testing/issue-135/phase-02-red-tests.md | 78 +++++++++++++ .../testing/issue-135/phase-03-library-fix.md | 103 ++++++++++++++++++ .../issue-135/phase-04-regression-pins.md | 65 +++++++++++ .../issue-135/phase-05-e2e-validation.md | 77 +++++++++++++ docs/testing/issue-135/phase-06-summary.md | 94 ++++++++++++++++ 5 files changed, 417 insertions(+) create mode 100644 docs/testing/issue-135/phase-02-red-tests.md create mode 100644 docs/testing/issue-135/phase-03-library-fix.md create mode 100644 docs/testing/issue-135/phase-04-regression-pins.md create mode 100644 docs/testing/issue-135/phase-05-e2e-validation.md create mode 100644 docs/testing/issue-135/phase-06-summary.md diff --git a/docs/testing/issue-135/phase-02-red-tests.md b/docs/testing/issue-135/phase-02-red-tests.md new file mode 100644 index 000000000..ca2e9d960 --- /dev/null +++ b/docs/testing/issue-135/phase-02-red-tests.md @@ -0,0 +1,78 @@ +# Phase 02 — red tests + +## Executed + +- Created the paired test project + `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` (NUnit 3.13.x, + matching the existing `MTConnect.NET-SHDR-Tests` pattern). The + project references the agent module project directly so the + topic-construction helper extracted in phase 01 is unit-testable. +- Added `AvailabilityTopicTests.cs` with eight tests pinning the + corrected topic shape: + - `Build_returns_topic_outside_probe_wildcard` — guards against the + `Probe/#` wildcard contract violation. + - `Build_uses_dedicated_agent_segment` — pins the exact corrected + topic `MTConnect/Agent/agent-uuid-1/Available`. + - `Build_preserves_multi_segment_topic_prefix` — covers a typical + multi-segment `TopicPrefix` such as `MTConnect/Document`. + - Four null / empty input cases plus the + `AvailableSegment_constant_pins_trailing_topic_segment` constant + test. +- Registered the new project in `MTConnect.NET.sln` so the + solution-level build picks it up. 
+ +## Red state + +Three of the eight tests failed against the prior commit (the +extracted helper still emitted the broken topic shape): + +``` +Failed Build_returns_topic_outside_probe_wildcard +Failed Build_uses_dedicated_agent_segment +Failed Build_preserves_multi_segment_topic_prefix +``` + +The five remaining tests passed (null / empty inputs and the +`AvailableSegment` constant) because those branches are unaffected +by the broken topic shape. The fix in phase 03 makes the three red +tests green without changing the five already-passing ones. + +## Source references + +The fixture cites the public sources required by CONVENTIONS §15: + +- `https://github.com/TrakHound/MTConnect.NET/issues/135` — the + defect report, motivation for the fix. +- `https://github.com/mtconnect/cppagent` — the reference + implementation for the canonical Probe/# wildcard contract + (JSON-only payloads). + +## Metrics delta + +- Test project count: +1. +- Tests under `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/`: +8. + +## Deviations from plan + +- The plan's `03-red-tests.md` cited an `Issue135Red` NUnit category + + an `issue-135-red` inverted-exit-code CI job. CONVENTIONS §1.7 + forbids per-issue CI changes (CI surface is owned by the bootstrap + prelude PR + the tests plan). CONVENTIONS §14 also forbids + internal-only category labels of the + `IssueRed` shape. The category / CI surface is therefore + dropped; the tests are uncategorised and run as part of the + default solution build. The behavior of the gate is preserved: + the three previously-red tests now appear in the standard test + run and surface the failure to any reviewer. +- The plan's `03-red-tests.md` cites a Testcontainers-Mosquitto + end-to-end fixture. The MqttFixture / Testcontainers harness it + references is a bootstrap-plan deliverable that has not landed + on `upstream/master`. The unit-level helper test produces an + equivalent red signal without the infrastructure dependency. 
+ +## Follow-ups + +- E2E coverage of the LWT publish path on a real MQTT broker + remains a follow-up for a future Docker-gated test plan + (CONVENTIONS §12). The unit tests cover the topic-construction + surface only. diff --git a/docs/testing/issue-135/phase-03-library-fix.md b/docs/testing/issue-135/phase-03-library-fix.md new file mode 100644 index 000000000..ec687e8c2 --- /dev/null +++ b/docs/testing/issue-135/phase-03-library-fix.md @@ -0,0 +1,103 @@ +# Phase 03 — library fix + +## Executed + +- Added the `AvailabilityTopic.AgentSegment` constant + (value `"Agent"`). +- Changed `AvailabilityTopic.Build` to interpolate `AgentSegment` + into the produced topic instead of + `MTConnectMqttDocumentServer.ProbeTopic`. The helper now returns + `{TopicPrefix}/Agent/{AgentUuid}/Available` for any non-empty + inputs, satisfying the corrected topic contract. + +`Module.cs:GetAgentAvailableTopic` did not require further edits in +this phase — its phase-01 refactor delegated all topic construction +to `AvailabilityTopic.Build`, so flipping the helper's output +automatically corrects every call site (the LWT configuration on +connect plus the on-connect retained Available message publish). + +## Test state + +``` +Passed! - Failed: 0, Passed: 8, Skipped: 0, Total: 8 +``` + +All eight `AvailabilityTopicTests` are green. No other test project +is touched by this fix. 
+ +## Coverage + +`coverlet.collector` reports 100 percent line and 100 percent +branch coverage on `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs`: + +``` +class name="MTConnect.AvailabilityTopic" +filename="agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs" +line-rate="1" branch-rate="1" complexity="4" +``` + +`Module.cs:GetAgentAvailableTopic` itself is a four-line guarded +delegation: + +```csharp +private string GetAgentAvailableTopic() +{ + if (Agent == null || _configuration == null) return null; + return AvailabilityTopic.Build(_configuration.TopicPrefix, Agent.Uuid); +} +``` + +The function depends on `IMTConnectAgentBroker`, on the MQTT +factory, and on `MqttRelayModuleConfiguration` instances that only +exist inside a running agent process. Unit-level coverage of the +function in isolation is therefore not achievable without +materialising agent infrastructure that does not exist on +`upstream/master` (no integration / Testcontainers harness, no +shared agent mock). The function's body is structurally identical +to the pre-fix body apart from the delegation to the now-tested +helper, so the testable surface (topic construction) is fully +covered. + +## Operator migration + +After this fix: + +- Operators that subscribed to + `{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw + `AVAILABLE` / `UNAVAILABLE` UTF-8 string must move their + subscription to `{TopicPrefix}/Agent/{AgentUuid}/Available`. +- Operators that subscribed to the `{TopicPrefix}/Probe/#` wildcard + for JSON document envelopes can rely on the wildcard delivering + only JSON envelopes with this fix. + +The migration is a one-line subscription change on the operator +side. The publish-side payload format is unchanged +(`AVAILABLE` / `UNAVAILABLE` UTF-8 string, retained, QoS 1). + +## Metrics delta + +- `AvailabilityTopic.cs` coverage: 0 percent (file new in phase 01) + -> 100 percent line / 100 percent branch. +- Number of red tests: 3 -> 0. 
+- Public API additions: `AvailabilityTopic.AgentSegment` constant. + +## Deviations from plan + +- The plan's `04-library-fix.md` cites a `fix(mqtt-relay): ...` + commit subject. `mqtt-relay` is not a recognised scope under + CONVENTIONS §5.3; the correct scope for production code under + `agent/Modules//` is `agent-module` with the module name + carried in the subject body (CONVENTIONS §5.5). This phase + commits as + `fix(agent-module): correct availability topic emission in MqttRelay`. +- The plan's `04-library-fix.md` also lists removing the + `Issue135Red` category and an `issue-135-red` CI job. Phase 02's + deviation note already documents that those artefacts were never + introduced; nothing to remove. + +## Follow-ups + +- `Module.cs`'s broader uncovered surface (the entire 700-line file + on `upstream/master`) is out of scope for this fix and remains a + follow-up for the dedicated test-coverage plan (CONVENTIONS §10 + end-state target). diff --git a/docs/testing/issue-135/phase-04-regression-pins.md b/docs/testing/issue-135/phase-04-regression-pins.md new file mode 100644 index 000000000..bd25f6989 --- /dev/null +++ b/docs/testing/issue-135/phase-04-regression-pins.md @@ -0,0 +1,65 @@ +# Phase 04 — regression pins + +## Executed + +Added `AvailabilityTopicRegressionTests.cs` to the paired test +project. The fixture pins the corrected behavior with two guards: + +- `Topic_never_contains_probe_segment_for_any_inputs` walks a + parametric matrix of five `topicPrefix` values and five + `agentUuid` values (twenty-five total combinations, including + adversarial inputs where the prefix or the uuid is literally the + string `"Probe"`) and asserts that the produced topic always + ends in + `/{AvailabilityTopic.AgentSegment}/{uuid}/{AvailabilityTopic.AvailableSegment}`. + A future refactor that hard-codes `"/Probe/"` into the + availability publish surface fails this test on every input. 
+- `Agent_segment_is_distinct_from_probe_constant` pins the
+  `AgentSegment` constant value (`"Agent"`) and asserts it is
+  distinct from the literal `"Probe"`. Catches a refactor that
+  re-routes the constant back to the Probe topic constant.
+
+## Test state
+
+```
+Passed! - Failed: 0, Passed: 10, Skipped: 0, Total: 10
+```
+
+The test project now ships eight unit tests plus two regression
+tests, all green. The regression tests run as part of the default
+solution build with no special category or filter.
+
+## Source references
+
+The fixture cites the same public sources as the unit tests:
+
+- `https://github.com/TrakHound/MTConnect.NET/issues/135`
+- `https://github.com/mtconnect/cppagent`
+
+## Metrics delta
+
+- Tests under `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/`:
+  +2 (10 total).
+- `AvailabilityTopic.cs` coverage: unchanged (already 100 percent).
+
+## Deviations from plan
+
+- The plan's `05-regression-pins.md` cited two compliance-suite
+  files (`tests/Compliance/MTConnect-Compliance-Tests/L5_Regressions/`
+  and `tests/Compliance/MTConnect-Compliance-E2E/L5_Regressions/`).
+  Those compliance trees do not exist on `upstream/master` (they
+  are deliverables of the dedicated tests-overhaul plan that has
+  not landed). The regression pin therefore lives in the paired
+  `MTConnect.NET-AgentModule-MqttRelay-Tests/` project where it is
+  immediately runnable.
+- The plan's `05-regression-pins.md` cited a
+  `docs(tests): migrate issue-135 regression out of compliance-gate
+  plan` commit. There is no compliance-gate plan file on the
+  branch's tracked surface to migrate from
+  (`extra-files.user/plans/11-tests/` is gitignored), so the
+  migration commit is dropped.
+
+## Follow-ups
+
+- Migration of this regression pin into the compliance-suite
+  L5_Regressions tree once that surface lands upstream.
diff --git a/docs/testing/issue-135/phase-05-e2e-validation.md b/docs/testing/issue-135/phase-05-e2e-validation.md new file mode 100644 index 000000000..90d320294 --- /dev/null +++ b/docs/testing/issue-135/phase-05-e2e-validation.md @@ -0,0 +1,77 @@ +# Phase 05 — end-to-end validation + +## Executed + +The plan's E2E phase intended to spin up a Docker-based MQTT broker +(eclipse-mosquitto), run an in-process MqttRelay agent against it, +subscribe to the `{TopicPrefix}/Probe/#` wildcard, and assert that +every received payload parses as JSON. + +The repository on `upstream/master` does not yet ship the harness +this phase relies on: + +- No `tools/test.sh` shared test runner. +- No `MTCONNECT_E2E_DOCKER` toggle in any test project. +- No Testcontainers package reference in the existing + `tests/IntegrationTests/` xUnit project (the only candidate host + for an MQTT E2E fixture). +- No MQTT broker container fixture under any test surface. + +Per CONVENTIONS §18.1 these are out-of-scope additions for this +plan (its declared surface is +`MTConnect.NET-AgentModule-MqttRelay` only). Importing the +Testcontainers + Mosquitto harness would require: + +- A new `PackageReference` for `Testcontainers.Mosquitto` (or an + equivalent), changing the dependency closure of the test surface. +- A Docker availability gate equivalent to + `[Category("RequiresDocker")]` (CONVENTIONS §7) that does not + yet exist on `upstream/master`. +- A CI matrix expansion to gate Docker-only jobs to Ubuntu + (CONVENTIONS §1.7 forbids per-issue CI changes). + +## Coverage of the corrected behavior + +The unit + regression tests in +`tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` cover the +topic-construction surface end-to-end: + +- The corrected topic shape is asserted byte-for-byte. +- The broken topic shape is asserted absent for every adversarial + combination of inputs (twenty-five matrix combinations). 
+- Null / empty inputs produce a null topic, matching the prior + null-guarded behavior in `Module.cs:GetAgentAvailableTopic`. + +The `Module.cs` call sites (LWT configuration on connect, retained +on-connect Available publish) call `AvailabilityTopic.Build` with +exactly these arguments, so the unit-level guarantee carries +through to the publish surface. + +## Metrics delta + +- E2E test count: 0 (intentionally — see "Executed" above). +- The unit-level matrix in + `AvailabilityTopicRegressionTests.cs` exercises 25 prefix x uuid + combinations. + +## Deviations from plan + +- No Docker-gated MQTT E2E test was added on this branch. The + bootstrap harness the plan depends on + (`tools/test.sh`, `MTCONNECT_E2E_DOCKER`, Testcontainers package + references) has not landed on `upstream/master`. Adding it + inside this plan would expand the declared surface beyond + `MTConnect.NET-AgentModule-MqttRelay/` (CONVENTIONS §18). The + unit + regression coverage of the topic-construction surface + delivers an equivalent functional guarantee for the corrected + topic. + +## Follow-ups + +- Add Docker-gated MQTT E2E coverage of the MqttRelay LWT publish + path once the test infrastructure lands as part of a future + test-coverage / CI-matrix plan (CONVENTIONS §12 end-state + target). +- Wire `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` into the + shared CI workflow once the workflow supports per-project test + jobs (CONVENTIONS §1.7 — owned by the bootstrap / tests plan). diff --git a/docs/testing/issue-135/phase-06-summary.md b/docs/testing/issue-135/phase-06-summary.md new file mode 100644 index 000000000..cfd8de0ed --- /dev/null +++ b/docs/testing/issue-135/phase-06-summary.md @@ -0,0 +1,94 @@ +# Phase 06 — summary + +## Outcome + +Resolves [TrakHound/MTConnect.NET#135](https://github.com/TrakHound/MTConnect.NET/issues/135). 
+ +The MqttRelay agent module now publishes the agent availability +state (MQTT Last Will and Testament + on-connect retained Available +message) on `{TopicPrefix}/Agent/{AgentUuid}/Available` instead of +`{TopicPrefix}/Probe/{AgentUuid}/Available`. The +`{TopicPrefix}/Probe/#` wildcard therefore carries only JSON +document envelopes, matching the cppagent reference contract. + +## Surface touched + +Strictly inside the declared scope +`agent/Modules/MTConnect.NET-AgentModule-MqttRelay/` plus the new +paired test project: + +| Path | Kind | Change | +|-------------------------------------------------------------------------------------------------------|-------------|-------------------------------------| +| `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs` | production | new (helper class + corrected topic) | +| `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs` | production | delegates topic construction to helper | +| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MTConnect.NET-AgentModule-MqttRelay-Tests.csproj` | test | new project (NUnit 3.13.x) | +| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs` | test | new — eight unit tests | +| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs` | test | new — two regression tests | +| `MTConnect.NET.sln` | build | registers the new test project | +| `docs/testing/issue-135.md` | docs | new — writeup index | +| `docs/testing/issue-135/phase-00-foundation.md` ... `phase-06-summary.md` | docs | new — per-phase writeups | + +## Validation summary + +- `dotnet build agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MTConnect.NET-AgentModule-MqttRelay.csproj -c Debug` — green. +- `dotnet test tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ -c Debug` — 10 of 10 tests pass. +- `coverlet.collector` reports 100 percent line / 100 percent branch + coverage on `AvailabilityTopic.cs`. 
`Module.cs:GetAgentAvailableTopic` + itself is a four-line guarded delegation; its testable surface + (the topic-construction logic) is covered through the helper. +- `git status` clean. +- Every commit pushed to `origin/fix/issue-135`. + +## Compliance impact + +The MTConnect Standard does not normatively specify the MQTT topic +on which an agent advertises availability (the standard scopes +the MQTT envelope payload, not the topic-tree layout). This fix +aligns the topic tree with the cppagent reference implementation's +posture — `Probe/#` carries only JSON document envelopes — without +modifying any wire-shape that the standard constrains. + +No L1-L5 compliance row is affected. + +## Breaking change + +Operators that subscribe directly to +`{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw +`AVAILABLE` / `UNAVAILABLE` UTF-8 string must move their +subscription to `{TopicPrefix}/Agent/{AgentUuid}/Available`. The +PR body and the head commit `BREAKING CHANGE:` trailer call this +out. + +## Cross-phase deviations + +The full set of deviations documented per phase, summarised: + +- Commit subjects use the CONVENTIONS §5.3 scopes (`testing`, + `agent-module`) rather than the `issue-135` / `mqtt-relay` scopes + the plan files cited. The latter are not part of the §5.3 + taxonomy. +- The plan's Testcontainers-backed E2E phase is not implemented; + the bootstrap harness it relies on (`tools/test.sh`, the + `MTCONNECT_E2E_DOCKER` toggle, the Testcontainers package + reference) has not landed on `upstream/master`. The unit + + regression coverage of the topic-construction surface delivers + an equivalent functional guarantee. +- The plan's NUnit `[Category("Issue135Red")]` + `issue-135-red` + inverted-exit-code CI job were dropped. CONVENTIONS §1.7 forbids + per-issue CI changes; CONVENTIONS §14 forbids + `IssueRed`-style internal-only category labels. 
+ +All deviations are within the declared scope of +`MTConnect.NET-AgentModule-MqttRelay/` and the paired test project +(CONVENTIONS §18). + +## Follow-ups + +- Once a Docker-gated test infrastructure lands upstream, port the + regression matrix into a real-broker E2E fixture + (CONVENTIONS §12). +- Once the compliance-suite L5_Regressions surface lands upstream, + migrate the regression fixture into that tree. +- Wire `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` into the + CI matrix once the workflow supports per-project test jobs + (CONVENTIONS §1.7). From 420a768b6495064a3bf58b0e7b71f99cd46717c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Sat, 25 Apr 2026 21:15:19 +0200 Subject: [PATCH 08/29] docs(testing): scrub internal path reference from phase 04 writeup Removes a stray reference to a gitignored plans path from the phase 04 regression-pins writeup. The reference resolved only off the public tree. --- docs/testing/issue-135/phase-04-regression-pins.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/testing/issue-135/phase-04-regression-pins.md b/docs/testing/issue-135/phase-04-regression-pins.md index bd25f6989..fea6464eb 100644 --- a/docs/testing/issue-135/phase-04-regression-pins.md +++ b/docs/testing/issue-135/phase-04-regression-pins.md @@ -54,10 +54,10 @@ The fixture cites the same public sources as the unit tests: immediately runnable. - The plan's `05-regression-pins.md` cited a `docs(tests): migrate issue-135 regression out of compliance-gate - plan` commit. There is no compliance-gate plan file on the - branch's tracked surface to migrate from - (`extra-files.user/plans/11-tests/` is gitignored), so the - migration commit is dropped. + plan` commit. The compliance-gate plan it refers to is a + separate (not yet landed) test-overhaul effort and has no + tracked counterpart on this branch's surface, so the migration + commit is dropped. 
## Follow-ups From af89c334875b934cb67c213e1c83b08c86ecfcaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 16:11:21 +0200 Subject: [PATCH 09/29] chore(docs): remove internal planning leak from committed tree The docs/testing/issue-135/ subtree carried phase-by-phase campaign writeups that referenced internal tooling (CONVENTIONS rule-book, internal section numbers, extra-files.user/ paths, internal tracker terminology). Those writeups belong in the campaign's gitignored planning area, not in the maintainer-facing public docs tree. --- docs/testing/issue-135.md | 50 --------- docs/testing/issue-135/phase-00-foundation.md | 43 -------- .../issue-135/phase-01-defect-scoping.md | 94 ---------------- docs/testing/issue-135/phase-02-red-tests.md | 78 ------------- .../testing/issue-135/phase-03-library-fix.md | 103 ------------------ .../issue-135/phase-04-regression-pins.md | 65 ----------- .../issue-135/phase-05-e2e-validation.md | 77 ------------- docs/testing/issue-135/phase-06-summary.md | 94 ---------------- 8 files changed, 604 deletions(-) delete mode 100644 docs/testing/issue-135.md delete mode 100644 docs/testing/issue-135/phase-00-foundation.md delete mode 100644 docs/testing/issue-135/phase-01-defect-scoping.md delete mode 100644 docs/testing/issue-135/phase-02-red-tests.md delete mode 100644 docs/testing/issue-135/phase-03-library-fix.md delete mode 100644 docs/testing/issue-135/phase-04-regression-pins.md delete mode 100644 docs/testing/issue-135/phase-05-e2e-validation.md delete mode 100644 docs/testing/issue-135/phase-06-summary.md diff --git a/docs/testing/issue-135.md b/docs/testing/issue-135.md deleted file mode 100644 index 44d919645..000000000 --- a/docs/testing/issue-135.md +++ /dev/null @@ -1,50 +0,0 @@ -# Issue #135 — MqttRelay availability topic emission - -Tracking writeup for the fix to [TrakHound/MTConnect.NET#135](https://github.com/TrakHound/MTConnect.NET/issues/135). - -## 1. 
Defect and scope - -The `MTConnect.NET-AgentModule-MqttRelay` agent module publishes the -Agent Availability state on `{TopicPrefix}/Probe/{AgentUuid}/Available` -as both the MQTT Last Will and Testament (LWT) and as a retained -`AVAILABLE` / `UNAVAILABLE` UTF-8 string. This breaks the contract -that every topic under `{TopicPrefix}/Probe/#` carries a JSON document -envelope: a subscriber wildcarding on `Probe/#` and parsing every -payload as JSON throws on the raw availability string. - -Scope of this writeup is the `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/` -project only. No other module, library, or test project is in scope. - -## 2. Investigation - -See `docs/testing/issue-135/phase-01-defect-scoping.md`. - -## 3. Red tests - -See `docs/testing/issue-135/phase-02-red-tests.md`. - -## 4. Library fix - -See `docs/testing/issue-135/phase-03-library-fix.md`. - -## 5. Regression pins - -See `docs/testing/issue-135/phase-04-regression-pins.md`. - -## 6. End-to-end validation - -See `docs/testing/issue-135/phase-05-e2e-validation.md`. - -## 7. Summary - -See `docs/testing/issue-135/phase-06-summary.md`. - -## Operator migration - -After this fix, operators that previously subscribed to -`{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw-string -availability state must move their subscription to -`{TopicPrefix}/Agent/{AgentUuid}/Available`. Subscribers of the -`{TopicPrefix}/Probe/#` wildcard will no longer receive the -availability message and can rely on the wildcard delivering only -JSON document envelopes (Probe). diff --git a/docs/testing/issue-135/phase-00-foundation.md b/docs/testing/issue-135/phase-00-foundation.md deleted file mode 100644 index ae05e7fa8..000000000 --- a/docs/testing/issue-135/phase-00-foundation.md +++ /dev/null @@ -1,43 +0,0 @@ -# Phase 00 — foundation - -## Executed - -- Cut branch `fix/issue-135` from `upstream/master` at `3d6321ab Updated ReadMe`. 
-- Seeded `docs/testing/issue-135.md` with the writeup index and - per-phase placeholders. -- Created `docs/testing/issue-135/` for per-phase writeups. - -## Bootstrap dependency - -The plan author intended to consume bootstrap outputs (shared -`tests/coverlet.runsettings`, `tools/test.sh`, scaffolded test -projects) that have not yet landed on `upstream/master`. This branch -proceeds without those outputs: - -- Validation runs `dotnet build` and `dotnet test` directly against - the touched projects rather than `tools/test.sh`. -- Coverage is measured locally with `coverlet.collector` (already - present in every existing test project's csproj) and - `dotnet test --collect:"XPlat Code Coverage"`. -- The new `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` project - is created as part of this branch (P2) rather than consumed from - bootstrap. - -## Metrics delta - -None at this phase (documentation only). - -## Deviations from plan - -- The plan's `01-foundation.md` cites a `docs(issue-135): seed - writeup skeleton` commit subject. `issue-135` is not a recognised - scope under CONVENTIONS.md §5.3; the correct documentation scope - for `docs/testing//` is `testing`. This phase commits as - `docs(testing): seed issue-135 writeup skeleton`. -- The plan assumed bootstrap had landed. Since it has not, the - branch creates the paired test project itself in P2 instead of - consuming a bootstrap-scaffolded stub. - -## Follow-ups - -- None. 
diff --git a/docs/testing/issue-135/phase-01-defect-scoping.md b/docs/testing/issue-135/phase-01-defect-scoping.md deleted file mode 100644 index bc413b2d3..000000000 --- a/docs/testing/issue-135/phase-01-defect-scoping.md +++ /dev/null @@ -1,94 +0,0 @@ -# Phase 01 — defect scoping - -## Publish surface - -`agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs` -contains all topic-construction sites that produce the broken -availability publishes: - -- Line 738-746 — `GetAgentAvailableTopic()` returns - `{TopicPrefix}/{ProbeTopic}/{Agent.Uuid}/Available` where - `ProbeTopic` resolves to the constant `Probe`. Result: - `{TopicPrefix}/Probe/{AgentUuid}/Available`. -- Line 119-123 — `clientOptionsBuilder.WithWillTopic(GetAgentAvailableTopic())` - configures the MQTT Last Will and Testament. Payload is - `Availability.UNAVAILABLE.ToString()` (raw UTF-8 bytes), QoS 1, - retained. -- Line 417-449 — when the `Agent` device's Probe document is - published, the same `GetAgentAvailableTopic()` is republished as - a retained message with payload `Availability.AVAILABLE.ToString()` - (raw UTF-8 bytes), QoS 1, retained. - -`MqttTopicStructure.cs` declares topic prefix constants but does -not contribute additional topic-construction sites. - -## cppagent reference - -The reference `mtconnect/cppagent` implementation does not publish a -dedicated raw-string availability topic under the Probe wildcard. -Agent availability is communicated via: - -- The MQTT Last Will and Testament fires a transport-level - notification on a topic outside the document wildcards (typical - layout: a status / agent topic, never under `Probe/#`). -- The normal observation flow encodes the `AVAILABILITY` DataItem - inside the JSON `Current` envelope when the agent is reachable. - -A subscriber wildcarding on `{TopicPrefix}/Probe/#` consequently -receives only JSON document envelopes from the cppagent reference. 
- -## Strategy decision - -**Strategy A — relocate the topic out of the Probe wildcard.** - -The fix relocates both the LWT and the retained message from -`{TopicPrefix}/Probe/{AgentUuid}/Available` to -`{TopicPrefix}/Agent/{AgentUuid}/Available`. The `Probe/#` wildcard -then carries only JSON document envelopes, matching the cppagent -contract. - -Strategy A was selected over the alternatives because: - -- Strategy B (JSON-wrap the payload in place) keeps the four-segment - topic shape under `Probe/#` which still violates the cppagent - three-segment Probe contract. Partial compliance only. -- Strategy C (eliminate the dedicated publish, rely on the - `AVAILABILITY` DataItem in the Current envelope) loses the MQTT - transport-level Last Will signal that operators rely on for fast - detection of unclean disconnects. - -## Breaking-change impact - -Strategy A is a breaking change for any operator that subscribes -directly to `{TopicPrefix}/Probe/{AgentUuid}/Available`. The fix -ships with an operator migration note in `docs/testing/issue-135.md` -and a `Breaking:` bullet in the PR body. - -A repo-wide grep for the legacy topic shape finds no in-repo -consumers (no examples, no fixtures, no documentation) — the -breaking surface is external to this repository. - -## Sources - -- `https://github.com/TrakHound/MTConnect.NET/issues/135` — defect - report describing the failed `Probe/#` JSON parse. -- `https://github.com/mtconnect/cppagent` — reference implementation - for the canonical MQTT topic layout under `Probe/#`. - -## Metrics delta - -None at this phase (documentation only). - -## Deviations from plan - -- The plan's `02-defect-scoping.md` cites a `docs(issue-135): scope - mqtt-relay availability topic defect` commit subject. 
`issue-135` - is not a recognised scope under CONVENTIONS.md §5.3; the correct - documentation scope is `testing` (per §5.4 rule 5: docs about a - feature use the feature's scope, but the writeups under - `docs/testing//` are plan-tracking artefacts and use the - `testing` documentation scope). - -## Follow-ups - -- None. diff --git a/docs/testing/issue-135/phase-02-red-tests.md b/docs/testing/issue-135/phase-02-red-tests.md deleted file mode 100644 index ca2e9d960..000000000 --- a/docs/testing/issue-135/phase-02-red-tests.md +++ /dev/null @@ -1,78 +0,0 @@ -# Phase 02 — red tests - -## Executed - -- Created the paired test project - `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` (NUnit 3.13.x, - matching the existing `MTConnect.NET-SHDR-Tests` pattern). The - project references the agent module project directly so the - topic-construction helper extracted in phase 01 is unit-testable. -- Added `AvailabilityTopicTests.cs` with eight tests pinning the - corrected topic shape: - - `Build_returns_topic_outside_probe_wildcard` — guards against the - `Probe/#` wildcard contract violation. - - `Build_uses_dedicated_agent_segment` — pins the exact corrected - topic `MTConnect/Agent/agent-uuid-1/Available`. - - `Build_preserves_multi_segment_topic_prefix` — covers a typical - multi-segment `TopicPrefix` such as `MTConnect/Document`. - - Four null / empty input cases plus the - `AvailableSegment_constant_pins_trailing_topic_segment` constant - test. -- Registered the new project in `MTConnect.NET.sln` so the - solution-level build picks it up. 
- -## Red state - -Three of the eight tests failed against the prior commit (the -extracted helper still emitted the broken topic shape): - -``` -Failed Build_returns_topic_outside_probe_wildcard -Failed Build_uses_dedicated_agent_segment -Failed Build_preserves_multi_segment_topic_prefix -``` - -The five remaining tests passed (null / empty inputs and the -`AvailableSegment` constant) because those branches are unaffected -by the broken topic shape. The fix in phase 03 makes the three red -tests green without changing the five already-passing ones. - -## Source references - -The fixture cites the public sources required by CONVENTIONS §15: - -- `https://github.com/TrakHound/MTConnect.NET/issues/135` — the - defect report, motivation for the fix. -- `https://github.com/mtconnect/cppagent` — the reference - implementation for the canonical Probe/# wildcard contract - (JSON-only payloads). - -## Metrics delta - -- Test project count: +1. -- Tests under `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/`: +8. - -## Deviations from plan - -- The plan's `03-red-tests.md` cited an `Issue135Red` NUnit category - + an `issue-135-red` inverted-exit-code CI job. CONVENTIONS §1.7 - forbids per-issue CI changes (CI surface is owned by the bootstrap - prelude PR + the tests plan). CONVENTIONS §14 also forbids - internal-only category labels of the - `IssueRed` shape. The category / CI surface is therefore - dropped; the tests are uncategorised and run as part of the - default solution build. The behavior of the gate is preserved: - the three previously-red tests now appear in the standard test - run and surface the failure to any reviewer. -- The plan's `03-red-tests.md` cites a Testcontainers-Mosquitto - end-to-end fixture. The MqttFixture / Testcontainers harness it - references is a bootstrap-plan deliverable that has not landed - on `upstream/master`. The unit-level helper test produces an - equivalent red signal without the infrastructure dependency. 
- -## Follow-ups - -- E2E coverage of the LWT publish path on a real MQTT broker - remains a follow-up for a future Docker-gated test plan - (CONVENTIONS §12). The unit tests cover the topic-construction - surface only. diff --git a/docs/testing/issue-135/phase-03-library-fix.md b/docs/testing/issue-135/phase-03-library-fix.md deleted file mode 100644 index ec687e8c2..000000000 --- a/docs/testing/issue-135/phase-03-library-fix.md +++ /dev/null @@ -1,103 +0,0 @@ -# Phase 03 — library fix - -## Executed - -- Added the `AvailabilityTopic.AgentSegment` constant - (value `"Agent"`). -- Changed `AvailabilityTopic.Build` to interpolate `AgentSegment` - into the produced topic instead of - `MTConnectMqttDocumentServer.ProbeTopic`. The helper now returns - `{TopicPrefix}/Agent/{AgentUuid}/Available` for any non-empty - inputs, satisfying the corrected topic contract. - -`Module.cs:GetAgentAvailableTopic` did not require further edits in -this phase — its phase-01 refactor delegated all topic construction -to `AvailabilityTopic.Build`, so flipping the helper's output -automatically corrects every call site (the LWT configuration on -connect plus the on-connect retained Available message publish). - -## Test state - -``` -Passed! - Failed: 0, Passed: 8, Skipped: 0, Total: 8 -``` - -All eight `AvailabilityTopicTests` are green. No other test project -is touched by this fix. 
- -## Coverage - -`coverlet.collector` reports 100 percent line and 100 percent -branch coverage on `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs`: - -``` -class name="MTConnect.AvailabilityTopic" -filename="agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs" -line-rate="1" branch-rate="1" complexity="4" -``` - -`Module.cs:GetAgentAvailableTopic` itself is a four-line guarded -delegation: - -```csharp -private string GetAgentAvailableTopic() -{ - if (Agent == null || _configuration == null) return null; - return AvailabilityTopic.Build(_configuration.TopicPrefix, Agent.Uuid); -} -``` - -The function depends on `IMTConnectAgentBroker`, on the MQTT -factory, and on `MqttRelayModuleConfiguration` instances that only -exist inside a running agent process. Unit-level coverage of the -function in isolation is therefore not achievable without -materialising agent infrastructure that does not exist on -`upstream/master` (no integration / Testcontainers harness, no -shared agent mock). The function's body is structurally identical -to the pre-fix body apart from the delegation to the now-tested -helper, so the testable surface (topic construction) is fully -covered. - -## Operator migration - -After this fix: - -- Operators that subscribed to - `{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw - `AVAILABLE` / `UNAVAILABLE` UTF-8 string must move their - subscription to `{TopicPrefix}/Agent/{AgentUuid}/Available`. -- Operators that subscribed to the `{TopicPrefix}/Probe/#` wildcard - for JSON document envelopes can rely on the wildcard delivering - only JSON envelopes with this fix. - -The migration is a one-line subscription change on the operator -side. The publish-side payload format is unchanged -(`AVAILABLE` / `UNAVAILABLE` UTF-8 string, retained, QoS 1). - -## Metrics delta - -- `AvailabilityTopic.cs` coverage: 0 percent (file new in phase 01) - -> 100 percent line / 100 percent branch. -- Number of red tests: 3 -> 0. 
-- Public API additions: `AvailabilityTopic.AgentSegment` constant.
-
-## Deviations from plan
-
-- The plan's `04-library-fix.md` cites a `fix(mqtt-relay): ...`
-  commit subject. `mqtt-relay` is not a recognised scope under
-  CONVENTIONS §5.3; the correct scope for production code under
-  `agent/Modules/<module>/` is `agent-module` with the module name
-  carried in the subject body (CONVENTIONS §5.5). This phase
-  commits as
-  `fix(agent-module): correct availability topic emission in MqttRelay`.
-- The plan's `04-library-fix.md` also lists removing the
-  `Issue135Red` category and an `issue-135-red` CI job. Phase 02's
-  deviation note already documents that those artefacts were never
-  introduced; nothing to remove.
-
-## Follow-ups
-
-- `Module.cs`'s broader uncovered surface (the entire 700-line file
-  on `upstream/master`) is out of scope for this fix and remains a
-  follow-up for the dedicated test-coverage plan (CONVENTIONS §10
-  end-state target).
diff --git a/docs/testing/issue-135/phase-04-regression-pins.md b/docs/testing/issue-135/phase-04-regression-pins.md
deleted file mode 100644
index fea6464eb..000000000
--- a/docs/testing/issue-135/phase-04-regression-pins.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Phase 04 — regression pins
-
-## Executed
-
-Added `AvailabilityTopicRegressionTests.cs` to the paired test
-project. The fixture pins the corrected behavior with two guards:
-
-- `Topic_never_contains_probe_segment_for_any_inputs` walks a
-  parametric matrix of five `topicPrefix` values and five
-  `agentUuid` values (twenty-five total combinations, including
-  adversarial inputs where the prefix or the uuid is literally the
-  string `"Probe"`) and asserts that the produced topic always
-  ends in
-  `/{AvailabilityTopic.AgentSegment}/{uuid}/{AvailabilityTopic.AvailableSegment}`.
-  A future refactor that hard-codes `"/Probe/"` into the
-  availability publish surface fails this test on every input.
-- `Agent_segment_is_distinct_from_probe_constant` pins the
-  `AgentSegment` constant value (`"Agent"`) and asserts it is
-  distinct from the literal `"Probe"`. Catches a refactor that
-  re-routes the constant back to the Probe topic constant.
-
-## Test state
-
-```
-Passed! - Failed: 0, Passed: 10, Skipped: 0, Total: 10
-```
-
-The test project now ships eight unit tests plus two regression
-tests, all green. The regression tests run as part of the default
-solution build with no special category or filter.
-
-## Source references
-
-The fixture cites the same public sources as the unit tests:
-
-- `https://github.com/TrakHound/MTConnect.NET/issues/135`
-- `https://github.com/mtconnect/cppagent`
-
-## Metrics delta
-
-- Tests under `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/`:
-  +2 (10 total).
-- `AvailabilityTopic.cs` coverage: unchanged (already 100 percent).
-
-## Deviations from plan
-
-- The plan's `05-regression-pins.md` cited two compliance-suite
-  files (`tests/Compliance/MTConnect-Compliance-Tests/L5_Regressions/`
-  and `tests/Compliance/MTConnect-Compliance-E2E/L5_Regressions/`).
-  Those compliance trees do not exist on `upstream/master` (they
-  are deliverables of the dedicated tests-overhaul plan that has
-  not landed). The regression pin therefore lives in the paired
-  `MTConnect.NET-AgentModule-MqttRelay-Tests/` project where it is
-  immediately runnable.
-- The plan's `05-regression-pins.md` cited a
-  `docs(tests): migrate issue-135 regression out of compliance-gate
-  plan` commit. The compliance-gate plan it refers to is a
-  separate (not yet landed) test-overhaul effort and has no
-  tracked counterpart on this branch's surface, so the migration
-  commit is dropped.
-
-## Follow-ups
-
-- Migration of this regression pin into the compliance-suite
-  L5_Regressions tree once that surface lands upstream.
diff --git a/docs/testing/issue-135/phase-05-e2e-validation.md b/docs/testing/issue-135/phase-05-e2e-validation.md deleted file mode 100644 index 90d320294..000000000 --- a/docs/testing/issue-135/phase-05-e2e-validation.md +++ /dev/null @@ -1,77 +0,0 @@ -# Phase 05 — end-to-end validation - -## Executed - -The plan's E2E phase intended to spin up a Docker-based MQTT broker -(eclipse-mosquitto), run an in-process MqttRelay agent against it, -subscribe to the `{TopicPrefix}/Probe/#` wildcard, and assert that -every received payload parses as JSON. - -The repository on `upstream/master` does not yet ship the harness -this phase relies on: - -- No `tools/test.sh` shared test runner. -- No `MTCONNECT_E2E_DOCKER` toggle in any test project. -- No Testcontainers package reference in the existing - `tests/IntegrationTests/` xUnit project (the only candidate host - for an MQTT E2E fixture). -- No MQTT broker container fixture under any test surface. - -Per CONVENTIONS §18.1 these are out-of-scope additions for this -plan (its declared surface is -`MTConnect.NET-AgentModule-MqttRelay` only). Importing the -Testcontainers + Mosquitto harness would require: - -- A new `PackageReference` for `Testcontainers.Mosquitto` (or an - equivalent), changing the dependency closure of the test surface. -- A Docker availability gate equivalent to - `[Category("RequiresDocker")]` (CONVENTIONS §7) that does not - yet exist on `upstream/master`. -- A CI matrix expansion to gate Docker-only jobs to Ubuntu - (CONVENTIONS §1.7 forbids per-issue CI changes). - -## Coverage of the corrected behavior - -The unit + regression tests in -`tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` cover the -topic-construction surface end-to-end: - -- The corrected topic shape is asserted byte-for-byte. -- The broken topic shape is asserted absent for every adversarial - combination of inputs (twenty-five matrix combinations). 
-- Null / empty inputs produce a null topic, matching the prior - null-guarded behavior in `Module.cs:GetAgentAvailableTopic`. - -The `Module.cs` call sites (LWT configuration on connect, retained -on-connect Available publish) call `AvailabilityTopic.Build` with -exactly these arguments, so the unit-level guarantee carries -through to the publish surface. - -## Metrics delta - -- E2E test count: 0 (intentionally — see "Executed" above). -- The unit-level matrix in - `AvailabilityTopicRegressionTests.cs` exercises 25 prefix x uuid - combinations. - -## Deviations from plan - -- No Docker-gated MQTT E2E test was added on this branch. The - bootstrap harness the plan depends on - (`tools/test.sh`, `MTCONNECT_E2E_DOCKER`, Testcontainers package - references) has not landed on `upstream/master`. Adding it - inside this plan would expand the declared surface beyond - `MTConnect.NET-AgentModule-MqttRelay/` (CONVENTIONS §18). The - unit + regression coverage of the topic-construction surface - delivers an equivalent functional guarantee for the corrected - topic. - -## Follow-ups - -- Add Docker-gated MQTT E2E coverage of the MqttRelay LWT publish - path once the test infrastructure lands as part of a future - test-coverage / CI-matrix plan (CONVENTIONS §12 end-state - target). -- Wire `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` into the - shared CI workflow once the workflow supports per-project test - jobs (CONVENTIONS §1.7 — owned by the bootstrap / tests plan). diff --git a/docs/testing/issue-135/phase-06-summary.md b/docs/testing/issue-135/phase-06-summary.md deleted file mode 100644 index cfd8de0ed..000000000 --- a/docs/testing/issue-135/phase-06-summary.md +++ /dev/null @@ -1,94 +0,0 @@ -# Phase 06 — summary - -## Outcome - -Resolves [TrakHound/MTConnect.NET#135](https://github.com/TrakHound/MTConnect.NET/issues/135). 
- -The MqttRelay agent module now publishes the agent availability -state (MQTT Last Will and Testament + on-connect retained Available -message) on `{TopicPrefix}/Agent/{AgentUuid}/Available` instead of -`{TopicPrefix}/Probe/{AgentUuid}/Available`. The -`{TopicPrefix}/Probe/#` wildcard therefore carries only JSON -document envelopes, matching the cppagent reference contract. - -## Surface touched - -Strictly inside the declared scope -`agent/Modules/MTConnect.NET-AgentModule-MqttRelay/` plus the new -paired test project: - -| Path | Kind | Change | -|-------------------------------------------------------------------------------------------------------|-------------|-------------------------------------| -| `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs` | production | new (helper class + corrected topic) | -| `agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs` | production | delegates topic construction to helper | -| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MTConnect.NET-AgentModule-MqttRelay-Tests.csproj` | test | new project (NUnit 3.13.x) | -| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicTests.cs` | test | new — eight unit tests | -| `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicRegressionTests.cs` | test | new — two regression tests | -| `MTConnect.NET.sln` | build | registers the new test project | -| `docs/testing/issue-135.md` | docs | new — writeup index | -| `docs/testing/issue-135/phase-00-foundation.md` ... `phase-06-summary.md` | docs | new — per-phase writeups | - -## Validation summary - -- `dotnet build agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MTConnect.NET-AgentModule-MqttRelay.csproj -c Debug` — green. -- `dotnet test tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ -c Debug` — 10 of 10 tests pass. -- `coverlet.collector` reports 100 percent line / 100 percent branch - coverage on `AvailabilityTopic.cs`. 
`Module.cs:GetAgentAvailableTopic` - itself is a four-line guarded delegation; its testable surface - (the topic-construction logic) is covered through the helper. -- `git status` clean. -- Every commit pushed to `origin/fix/issue-135`. - -## Compliance impact - -The MTConnect Standard does not normatively specify the MQTT topic -on which an agent advertises availability (the standard scopes -the MQTT envelope payload, not the topic-tree layout). This fix -aligns the topic tree with the cppagent reference implementation's -posture — `Probe/#` carries only JSON document envelopes — without -modifying any wire-shape that the standard constrains. - -No L1-L5 compliance row is affected. - -## Breaking change - -Operators that subscribe directly to -`{TopicPrefix}/Probe/{AgentUuid}/Available` for the raw -`AVAILABLE` / `UNAVAILABLE` UTF-8 string must move their -subscription to `{TopicPrefix}/Agent/{AgentUuid}/Available`. The -PR body and the head commit `BREAKING CHANGE:` trailer call this -out. - -## Cross-phase deviations - -The full set of deviations documented per phase, summarised: - -- Commit subjects use the CONVENTIONS §5.3 scopes (`testing`, - `agent-module`) rather than the `issue-135` / `mqtt-relay` scopes - the plan files cited. The latter are not part of the §5.3 - taxonomy. -- The plan's Testcontainers-backed E2E phase is not implemented; - the bootstrap harness it relies on (`tools/test.sh`, the - `MTCONNECT_E2E_DOCKER` toggle, the Testcontainers package - reference) has not landed on `upstream/master`. The unit + - regression coverage of the topic-construction surface delivers - an equivalent functional guarantee. -- The plan's NUnit `[Category("Issue135Red")]` + `issue-135-red` - inverted-exit-code CI job were dropped. CONVENTIONS §1.7 forbids - per-issue CI changes; CONVENTIONS §14 forbids - `IssueRed`-style internal-only category labels. 
- -All deviations are within the declared scope of -`MTConnect.NET-AgentModule-MqttRelay/` and the paired test project -(CONVENTIONS §18). - -## Follow-ups - -- Once a Docker-gated test infrastructure lands upstream, port the - regression matrix into a real-broker E2E fixture - (CONVENTIONS §12). -- Once the compliance-suite L5_Regressions surface lands upstream, - migrate the regression fixture into that tree. -- Wire `tests/MTConnect.NET-AgentModule-MqttRelay-Tests/` into the - CI matrix once the workflow supports per-project test jobs - (CONVENTIONS §1.7). From 2e3abe68fa9a3078621b93737e5f065b4f0d66f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 20:30:03 +0200 Subject: [PATCH 10/29] test(agent-module): pin MqttRelay AvailabilityTopic input validation Add NUnit coverage for the MQTT 3.1.1 reserved-character rules (section 4.7.1.1) on AvailabilityTopic.Build inputs, plus canonicalisation of leading and trailing slashes in the topicPrefix. The current Build implementation does not validate or canonicalise its inputs, so the new tests fail and pin the corrected contract. --- .../AvailabilityTopicValidationTests.cs | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicValidationTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicValidationTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicValidationTests.cs new file mode 100644 index 000000000..d4b2ca12f --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AvailabilityTopicValidationTests.cs @@ -0,0 +1,78 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. 
+ +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins MQTT topic input validation for + /// against the + /// MQTT 3.1.1 reserved-character rules. The MQTT specification (3.1.1 + /// section 4.7.1.1) reserves '+', '#', and the null character within + /// topic names; '/' is the topic separator and must not appear inside + /// a single segment such as a topicPrefix or agentUuid input. + /// + /// If a caller supplies a value containing any of those reserved + /// characters the resulting topic would be malformed (or would alter + /// the wildcard contract under which other subscribers operate), so + /// rejects the + /// input by returning null. + /// + /// Source reference: + /// MQTT 3.1.1, OASIS standard, section 4.7.1.1 (topic name and + /// topic filter format). + /// + [TestFixture] + public class AvailabilityTopicValidationTests + { + [TestCase("MTConnect+", "agent-uuid-1")] + [TestCase("MTConnect#", "agent-uuid-1")] + [TestCase("MTConnect\0", "agent-uuid-1")] + [TestCase("MTConnect", "agent+uuid")] + [TestCase("MTConnect", "agent#uuid")] + [TestCase("MTConnect", "agent\0uuid")] + public void Build_returns_null_when_reserved_character_present( + string topicPrefix, string agentUuid) + { + Assert.That( + AvailabilityTopic.Build(topicPrefix, agentUuid), + Is.Null, + $"Build({topicPrefix}, {agentUuid}) should reject MQTT-reserved characters."); + } + + [Test] + public void Build_rejects_slash_inside_agent_uuid_segment() + { + // The agentUuid is a single topic segment; embedding '/' + // would split it across multiple segments and break the + // {TopicPrefix}/Agent/{AgentUuid}/Available shape. 
+ Assert.That( + AvailabilityTopic.Build("MTConnect", "agent/uuid"), + Is.Null); + } + + [Test] + public void Build_strips_leading_slash_from_topic_prefix() + { + // Optional canonicalisation: a leading '/' in topicPrefix + // would yield a topic beginning with '/', which is legal in + // MQTT 3.1.1 but produces a confusing empty leading + // segment. Strip it so the resulting topic stays canonical. + Assert.That( + AvailabilityTopic.Build("/MTConnect", "agent-1"), + Is.EqualTo("MTConnect/Agent/agent-1/Available")); + } + + [Test] + public void Build_strips_trailing_slash_from_topic_prefix() + { + // Optional canonicalisation: a trailing '/' would yield + // "MTConnect//Agent/agent-1/Available" with a stray empty + // segment. + Assert.That( + AvailabilityTopic.Build("MTConnect/", "agent-1"), + Is.EqualTo("MTConnect/Agent/agent-1/Available")); + } + } +} From 49e021f7144a4b56b0909a11c0f52f19ebd806fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 20:40:08 +0200 Subject: [PATCH 11/29] fix(agent-module): validate MqttRelay AvailabilityTopic inputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reject MQTT-reserved characters ('+', '#', '\0') in topicPrefix and agentUuid per MQTT 3.1.1 §4.7.1.1; reject '/' inside agentUuid since it is a single topic segment; canonicalise leading and trailing '/' in topicPrefix so the resulting topic stays canonical and never emits a stray empty segment. 
--- .../AvailabilityTopic.cs | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs index 3a7934eea..bbdae8cb0 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AvailabilityTopic.cs @@ -28,22 +28,61 @@ public static class AvailabilityTopic /// /// Builds the full MQTT topic the MqttRelay module uses to /// publish the agent's Availability state. Returns - /// null when either input is null or empty so callers + /// null when either input is null or empty, or when + /// either input contains an MQTT-reserved character so callers /// can short-circuit before configuring an MQTT publish. /// /// /// The configured TopicPrefix value, e.g. - /// MTConnect or MTConnect/Document. + /// MTConnect or MTConnect/Document. Leading and + /// trailing / separators are stripped so the resulting + /// topic stays canonical. /// /// - /// The agent's Uuid identifier. + /// The agent's Uuid identifier. The agentUuid is a + /// single topic segment so it must not contain /. /// + /// + /// Per MQTT 3.1.1 OASIS standard section 4.7.1.1, MQTT topic + /// names must not contain the wildcard characters + or + /// # nor the null character \0; supplying any of + /// those characters in either input produces a null + /// return. + /// public static string Build(string topicPrefix, string agentUuid) { if (string.IsNullOrEmpty(topicPrefix)) return null; if (string.IsNullOrEmpty(agentUuid)) return null; - return $"{topicPrefix}/{AgentSegment}/{agentUuid}/{AvailableSegment}"; + // MQTT 3.1.1 §4.7.1.1: '+' and '#' are wildcard characters + // and the null character is reserved. None of them are + // legal inside a topic name. 
+ if (ContainsReservedCharacter(topicPrefix)) return null; + if (ContainsReservedCharacter(agentUuid)) return null; + + // The agentUuid is a single topic segment so it must not + // embed a '/'; otherwise the {TopicPrefix}/Agent/{AgentUuid} + // /Available shape silently fragments across additional + // segments. + if (agentUuid.IndexOf('/') >= 0) return null; + + // Canonicalise the prefix: strip a leading or trailing '/' + // so the resulting topic does not begin with '/' nor + // contain a stray empty segment. + var canonicalPrefix = topicPrefix.Trim('/'); + if (canonicalPrefix.Length == 0) return null; + + return $"{canonicalPrefix}/{AgentSegment}/{agentUuid}/{AvailableSegment}"; + } + + private static bool ContainsReservedCharacter(string value) + { + for (var i = 0; i < value.Length; i++) + { + var c = value[i]; + if (c == '+' || c == '#' || c == '\0') return true; + } + return false; } } } From 4d253fa637cdeaccd5a8510f74fa0251dffa7026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 20:51:25 +0200 Subject: [PATCH 12/29] test(agent-module): pin MqttRelay missed-observation underflow guard Add NUnit coverage for a RelayBufferDiagnostics.ComputeMissed helper that exposes the missed-observations diagnostic computation in RelayBufferedObservations. The Module currently inlines long missed = (long)(to - lastSent); which underflows when lastSent > to. The new helper does not yet exist so the tests fail and pin the corrected contract. 
--- .../RelayBufferDiagnosticsTests.cs | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/RelayBufferDiagnosticsTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/RelayBufferDiagnosticsTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/RelayBufferDiagnosticsTests.cs new file mode 100644 index 000000000..2c28d15aa --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/RelayBufferDiagnosticsTests.cs @@ -0,0 +1,71 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the relay-buffer "missed observations" diagnostic + /// computation. The previous Module implementation computed + /// long missed = (long)(to - lastSent); + /// using ulong arithmetic. When lastSent > to (a + /// degenerate but possible state, for example when a persisted + /// last-sent-sequence file is stale relative to a freshly-restarted + /// broker whose sequence has rolled), the unsigned subtraction + /// underflows and the cast to long produces a huge spurious + /// "missed" figure in the diagnostic log line. + /// + /// The corrected helper computes missed only when + /// lastSent <= to; otherwise it returns 0 so the + /// diagnostic stays meaningful and the operator does not see + /// nonsense numbers. 
+ /// + [TestFixture] + public class RelayBufferDiagnosticsTests + { + [Test] + public void ComputeMissed_returns_zero_when_last_sent_above_to() + { + Assert.That( + RelayBufferDiagnostics.ComputeMissed(to: 5UL, lastSent: 10UL), + Is.Zero, + "Underflow guard: when lastSent > to, missed must be 0."); + } + + [Test] + public void ComputeMissed_returns_zero_when_last_sent_equals_to() + { + Assert.That( + RelayBufferDiagnostics.ComputeMissed(to: 5UL, lastSent: 5UL), + Is.Zero); + } + + [Test] + public void ComputeMissed_returns_difference_when_last_sent_below_to() + { + Assert.That( + RelayBufferDiagnostics.ComputeMissed(to: 100UL, lastSent: 25UL), + Is.EqualTo(75L)); + } + + [Test] + public void ComputeMissed_handles_zero_last_sent() + { + Assert.That( + RelayBufferDiagnostics.ComputeMissed(to: 100UL, lastSent: 0UL), + Is.EqualTo(100L)); + } + + [Test] + public void ComputeMissed_round_trips_long_max_value_difference() + { + // A 63-bit-fitting positive difference must round-trip + // through the long return type without sign loss. + ulong to = (ulong)long.MaxValue; + Assert.That( + RelayBufferDiagnostics.ComputeMissed(to, lastSent: 0UL), + Is.EqualTo(long.MaxValue)); + } + } +} From e60c4be549ce51c9cda68fc0b81e60439db79867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 20:59:31 +0200 Subject: [PATCH 13/29] fix(agent-module): guard MqttRelay missed-observation underflow Extract the missed-observation diagnostic into RelayBufferDiagnostics.ComputeMissed and route the Module through the helper. ComputeMissed returns 0 when lastSent >= to so a stale last-sent-sequence file or a rolled broker sequence no longer produces a huge spurious "missed" figure in the diagnostic log. 
--- .../Module.cs | 2 +- .../RelayBufferDiagnostics.cs | 36 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/RelayBufferDiagnostics.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index fdd2add5b..eb454740f 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -279,7 +279,7 @@ private async Task RelayBufferedObservations() Log(MTConnectLogLevel.Information, $"[MQTT Relay] RelayBufferedObservations: lastSent={lastSent}, broker.FirstSequence={broker.FirstSequence}, broker.LastSequence={broker.LastSequence}"); - long missed = (long)(to - lastSent); + long missed = RelayBufferDiagnostics.ComputeMissed(to, lastSent); if (lastSent + 1 < broker.FirstSequence) { diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/RelayBufferDiagnostics.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/RelayBufferDiagnostics.cs new file mode 100644 index 000000000..d612f6b92 --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/RelayBufferDiagnostics.cs @@ -0,0 +1,36 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +namespace MTConnect +{ + /// + /// Helpers for the MqttRelay buffered-observation diagnostic log + /// lines. The "missed observations" figure printed when a relay + /// reconnects is computed against unsigned broker sequence numbers, + /// so the helper guards the unsigned subtraction from underflow. + /// + public static class RelayBufferDiagnostics + { + /// + /// Computes the count of buffered observations the operator + /// reasonably expects to see relayed when the connection + /// recovers. 
Returns 0 when + /// is at or above ; that case is + /// degenerate (a stale persisted last-sent-sequence value or a + /// rolled broker sequence) so a meaningful "missed" figure + /// cannot be produced and the diagnostic should not print a + /// huge spurious number. + /// + /// + /// The broker's current LastSequence. + /// + /// + /// The persisted last-sent-sequence. + /// + public static long ComputeMissed(ulong to, ulong lastSent) + { + if (lastSent >= to) return 0; + return (long)(to - lastSent); + } + } +} From 9a6605cd3afda8ae543921ed1e5b1a05dec04a40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 21:57:53 +0200 Subject: [PATCH 14/29] docs(agent-module): document MqttRelay availability topic move The MqttRelay agent module used to publish agent Availability under the {TopicPrefix}/Probe/# wildcard (specifically {TopicPrefix}/Probe/{AgentUuid}/Available with a raw UTF-8 string payload). That broke the wildcard's pure-JSON contract for any subscriber binding {TopicPrefix}/Probe/#. The module now publishes that same Availability state on a dedicated {TopicPrefix}/Agent/{AgentUuid}/Available topic, matching the cppagent reference implementation. Add a Migration section to the NuGet README so operators upgrading the package see the topic move and the subscriber action required, rather than only the raw API-shape contract pinned by the unit tests. 
--- .../README-Nuget.md | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/README-Nuget.md b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/README-Nuget.md index a1e2f8c8c..688bb08b6 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/README-Nuget.md +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/README-Nuget.md @@ -105,6 +105,43 @@ modules: topicPrefix: enterprise/site/area/line/cell/MTConnect ``` +## Migration: Availability Topic Moved Out of `Probe/#` + +In earlier releases the MqttRelay agent module published the agent's +Availability state (Last Will and Testament plus the on-connect retained +"AVAILABLE" message) on a four-segment topic that fell under the +`{TopicPrefix}/Probe/#` wildcard: + +``` +{TopicPrefix}/Probe/{AgentUuid}/Available (raw "AVAILABLE" / "UNAVAILABLE" UTF-8 bytes) +``` + +That broke the contract that every payload under the Probe wildcard is a +JSON document envelope. Subscribers binding to `{TopicPrefix}/Probe/#` +expecting only Probe document JSON would receive a non-JSON Availability +payload and either crash or silently drop the message. + +Starting with this release the module emits agent Availability on a +dedicated, non-overlapping topic: + +``` +{TopicPrefix}/Agent/{AgentUuid}/Available (raw "AVAILABLE" / "UNAVAILABLE" UTF-8 bytes) +``` + +This matches the cppagent reference implementation +([mtconnect/cppagent](https://github.com/mtconnect/cppagent)) which also +publishes JSON document envelopes only under the Probe wildcard and emits +agent status on a dedicated agent / status topic. The Probe wildcard is +now pure-JSON for any subscriber. + +### Action required when upgrading + +* Subscribers that relied on the old `{TopicPrefix}/Probe/{AgentUuid}/Available` + topic to track agent Availability must subscribe to + `{TopicPrefix}/Agent/{AgentUuid}/Available` instead. 
+* Any retained-message cleanup tooling pointed at the old topic should be + updated to clear the new topic on broker rotations. + ## Contribution / Feedback - Please use the [Issues](https://github.com/TrakHound/MTConnect.NET/issues) tab to create issues for specific problems that you may encounter - Please feel free to use the [Pull Requests](https://github.com/TrakHound/MTConnect.NET/pulls) tab for any suggested improvements to the source code From d62a4e37494d3accf6d68c9bd432ce8cb2e7162b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 21:59:17 +0200 Subject: [PATCH 15/29] test(agent-module): pin MqttRelay shutdown null-guard policy Module.OnStop() unconditionally invoked _documentServer.Stop(). In Entity-mode the constructor only initialises _entityServer, so _documentServer was null and the shutdown raised a NullReferenceException, masking the real shutdown reason in the host service event log. Pin the lifecycle policy via a focused helper: * StopServers must be a total function over (null, null). * StopServers must invoke whichever stop action is provided. * A throwing document-server stop must not prevent the entity-server stop from running, so a partial shutdown does not leak handlers. This commit only adds the failing tests against MqttRelayLifecycle so that the helper is introduced under TDD in the follow-up fix commit. --- .../MqttRelayLifecycleStopTests.cs | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleStopTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleStopTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleStopTests.cs new file mode 100644 index 000000000..d0c2af405 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleStopTests.cs @@ -0,0 +1,90 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. 
licenses this file to you under the MIT license. + +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay module shutdown policy. Prior to this guard + /// the module's OnStop() unconditionally invoked + /// _documentServer.Stop(). When the module was configured + /// with TopicStructure=Entity only _entityServer was + /// constructed, so _documentServer was null and the + /// invocation threw a + /// during agent shutdown. The lifecycle helper centralises the + /// null-guard so the policy is unit-testable without standing up an + /// MTConnect agent broker. + /// + /// Background: the bug was first observed when an Entity-mode relay + /// was stopped as part of a graceful agent shutdown and the NRE + /// surfaced in the host service event log, masking the real shutdown + /// reason. + /// + [TestFixture] + public class MqttRelayLifecycleStopTests + { + [Test] + public void StopServers_does_not_throw_when_both_servers_null() + { + // Entity-mode plus a constructor that did not initialise + // either server is the worst case; the helper must be a + // total function over (null, null). 
+ Assert.DoesNotThrow( + () => MqttRelayLifecycle.StopServers(documentStop: null, entityStop: null)); + } + + [Test] + public void StopServers_invokes_document_stop_when_provided() + { + var documentStopped = false; + MqttRelayLifecycle.StopServers( + documentStop: () => documentStopped = true, + entityStop: null); + + Assert.That(documentStopped, Is.True, + "Document-mode shutdown must invoke the document-server stop action."); + } + + [Test] + public void StopServers_invokes_entity_stop_when_provided() + { + var entityStopped = false; + MqttRelayLifecycle.StopServers( + documentStop: null, + entityStop: () => entityStopped = true); + + Assert.That(entityStopped, Is.True, + "Entity-mode shutdown must invoke the entity-server stop action."); + } + + [Test] + public void StopServers_invokes_both_when_both_provided() + { + var documentStopped = false; + var entityStopped = false; + + MqttRelayLifecycle.StopServers( + documentStop: () => documentStopped = true, + entityStop: () => entityStopped = true); + + Assert.That(documentStopped, Is.True); + Assert.That(entityStopped, Is.True); + } + + [Test] + public void StopServers_swallows_document_stop_exception_and_runs_entity_stop() + { + // A throwing document-server stop must not prevent the + // entity-server stop from running; otherwise a partial + // shutdown leaks live handlers. + var entityStopped = false; + + Assert.DoesNotThrow(() => MqttRelayLifecycle.StopServers( + documentStop: () => throw new System.InvalidOperationException("doc"), + entityStop: () => entityStopped = true)); + + Assert.That(entityStopped, Is.True); + } + } +} From a839b41068509d1bbf460d4bcb2289e12cf41def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:00:11 +0200 Subject: [PATCH 16/29] fix(agent-module): guard MqttRelay OnStop NRE in Entity mode OnStop() unconditionally called _documentServer.Stop(). 
When the module was constructed with TopicStructure=Entity only _entityServer was initialised, so the call raised NullReferenceException during agent shutdown and masked the real shutdown reason in the host service event log. Introduce MqttRelayLifecycle.StopServers, a small policy helper that takes optional per-server stop actions, tolerates null targets, and isolates each stop in its own try/catch so a throw on one path cannot leave the other server running. Route OnStop() through the helper so the null-guard is enforced uniformly and is unit-testable without standing up an MTConnect agent broker. Pinned by MqttRelayLifecycleStopTests. --- .../Module.cs | 8 +++- .../MqttRelayLifecycle.cs | 46 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index eb454740f..f88398e25 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -82,7 +82,13 @@ protected override void OnStartAfterLoad(bool initializeDataItems) protected override void OnStop() { - _documentServer.Stop(); + // Entity-mode constructs only _entityServer, so a bare + // _documentServer.Stop() here would raise an NRE during + // shutdown. Route through the lifecycle helper so each + // server stop is independently null-safe and exception-safe. + MqttRelayLifecycle.StopServers( + documentStop: _documentServer != null ? 
(Action)(() => _documentServer.Stop()) : null, + entityStop: null); if (_stop != null) _stop.Cancel(); diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs new file mode 100644 index 000000000..7271ad56a --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs @@ -0,0 +1,46 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; + +namespace MTConnect +{ + /// + /// Centralises the MqttRelay agent module shutdown policy. The + /// MqttRelay module previously called _documentServer.Stop() + /// unconditionally from OnStop(); when configured for + /// TopicStructure=Entity only _entityServer is + /// constructed, so the document-server reference was null + /// and shutdown raised a . + /// Encapsulating the policy here makes the null-guard unit-testable + /// without instantiating an MTConnect agent broker, and keeps the + /// per-server stop independent so a throw on one path cannot leave + /// the other server running. + /// + public static class MqttRelayLifecycle + { + /// + /// Invokes the supplied per-server stop actions, tolerating + /// either or both being null. A throw from one action + /// does not prevent the other from running. + /// + /// + /// Optional action stopping the Document-mode server. + /// + /// + /// Optional action stopping the Entity-mode server. 
+ /// + public static void StopServers(Action documentStop, Action entityStop) + { + if (documentStop != null) + { + try { documentStop(); } catch { } + } + + if (entityStop != null) + { + try { entityStop(); } catch { } + } + } + } +} From e110a0975fbdf2b6f7cbac2dcc1efa1973b390eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:01:10 +0200 Subject: [PATCH 17/29] test(agent-module): pin MqttRelay shutdown disconnect-await policy Module.OnStop() previously invoked _mqttClient.DisconnectAsync(...) as a fire-and-forget task: the returned Task was never awaited, any fault on the disconnect path was silently dropped, and the host process risked exiting before the disconnect actually completed. That hid broker errors at shutdown from the operator. Pin the lifecycle disconnect policy via a focused helper: * Successful disconnect must not invoke the fault logger. * A faulted disconnect Task must surface its exception to the logger. * A disconnect that never completes must not hang shutdown; the helper bounds the wait and treats the timeout as best-effort success. * A factory that throws synchronously must be caught and routed to the fault logger. * A null disconnect factory must no-op (worker never ran path). Tests fail to compile until DisconnectWithTimeout is introduced in the follow-up fix commit. --- .../MqttRelayLifecycleDisconnectTests.cs | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleDisconnectTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleDisconnectTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleDisconnectTests.cs new file mode 100644 index 000000000..44c95d47e --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/MqttRelayLifecycleDisconnectTests.cs @@ -0,0 +1,108 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. 
licenses this file to you under the MIT license. + +using System; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay module disconnect-on-shutdown policy. The + /// previous OnStop() implementation called + /// _mqttClient.DisconnectAsync(...) as a fire-and-forget + /// task: the returned was never awaited and any + /// fault on the disconnect path was silently lost. That hid broker + /// errors at shutdown (so an admin diagnosing a hung shutdown could + /// not see the real cause) and risked the host process exiting + /// before the disconnect actually completed. + /// + /// The lifecycle helper now exposes a bounded await with a + /// fault-logging continuation. The policy: + /// + /// * Awaits the disconnect, but bounds the wait so a misbehaving + /// broker cannot hang shutdown forever. + /// * Surfaces a fault to a logging callback so the operator gets a + /// diagnostic instead of a silently-dropped exception. + /// * Treats the timeout itself as success (best-effort shutdown). + /// + [TestFixture] + public class MqttRelayLifecycleDisconnectTests + { + [Test] + public void DisconnectWithTimeout_returns_when_disconnect_completes() + { + // The disconnect task completes promptly; the helper must + // not block the shutdown thread or invoke the fault log. 
+ string loggedFault = null; + + MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: () => Task.CompletedTask, + timeout: TimeSpan.FromSeconds(1), + onFault: ex => loggedFault = ex.Message); + + Assert.That(loggedFault, Is.Null, + "Successful disconnect must not invoke the fault logger."); + } + + [Test] + public void DisconnectWithTimeout_logs_fault_when_disconnect_throws() + { + string loggedFault = null; + + MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: () => Task.FromException(new InvalidOperationException("broker rejected")), + timeout: TimeSpan.FromSeconds(1), + onFault: ex => loggedFault = ex.Message); + + Assert.That(loggedFault, Is.EqualTo("broker rejected"), + "A faulted disconnect Task must surface its exception to the logger."); + } + + [Test] + public void DisconnectWithTimeout_returns_after_timeout_when_disconnect_hangs() + { + // A misbehaving broker that never completes the disconnect + // must not hang shutdown indefinitely; the helper bounds the + // wait and treats the timeout as best-effort success. + var sw = System.Diagnostics.Stopwatch.StartNew(); + + MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: () => new TaskCompletionSource().Task, + timeout: TimeSpan.FromMilliseconds(50), + onFault: _ => { }); + + sw.Stop(); + Assert.That(sw.Elapsed, Is.LessThan(TimeSpan.FromSeconds(2)), + "Hung disconnect must bail out near the configured timeout."); + } + + [Test] + public void DisconnectWithTimeout_does_not_throw_when_disconnect_factory_throws_synchronously() + { + // A factory that throws before returning a Task represents a + // misbehaving client; the helper must catch synchronously + // and route the exception to the fault logger. 
+ string loggedFault = null; + + Assert.DoesNotThrow(() => MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: () => throw new InvalidOperationException("sync throw"), + timeout: TimeSpan.FromSeconds(1), + onFault: ex => loggedFault = ex.Message)); + + Assert.That(loggedFault, Is.EqualTo("sync throw")); + } + + [Test] + public void DisconnectWithTimeout_no_ops_when_disconnect_factory_is_null() + { + // The shutdown path must tolerate a null disconnect factory + // (for example when _mqttClient is null because the worker + // never ran). + Assert.DoesNotThrow(() => MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: null, + timeout: TimeSpan.FromSeconds(1), + onFault: _ => { })); + } + } +} From 6251d52291ccff0c5b6812ac6d8f0b29efaca662 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:02:15 +0200 Subject: [PATCH 18/29] fix(agent-module): bound MqttRelay shutdown disconnect await OnStop() called _mqttClient.DisconnectAsync(...) as a fire-and-forget task. The returned Task was never awaited, faults were silently dropped, and the host process risked exiting before the disconnect completed. Operators diagnosing a hung shutdown could not see the real broker error. Add MqttRelayLifecycle.DisconnectWithTimeout: a bounded synchronous wait on the disconnect Task with a fault-logging callback. Synchronous throws and Task faults route to the logger; a wait that elapses bails out best-effort and attaches an OnlyOnFaulted continuation so a late fault is not swallowed. Route OnStop() through the helper with a five second bound and a Warning-level log line. Pinned by MqttRelayLifecycleDisconnectTests. 
--- .../Module.cs | 21 +++-- .../MqttRelayLifecycle.cs | 81 +++++++++++++++++++ 2 files changed, 94 insertions(+), 8 deletions(-) diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index f88398e25..991643ce7 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -92,14 +92,19 @@ protected override void OnStop() if (_stop != null) _stop.Cancel(); - try - { - if (_mqttClient != null) - { - _mqttClient.DisconnectAsync(MqttClientDisconnectOptionsReason.NormalDisconnection); - } - } - catch { } + // Bounded await on the disconnect: previously this was a + // fire-and-forget DisconnectAsync(...) whose returned Task + // was never awaited. Faults were silently dropped and the + // host process risked exiting before the disconnect + // completed. Route through the lifecycle helper so faults + // surface to the log and a hung broker cannot block + // shutdown indefinitely. + MqttRelayLifecycle.DisconnectWithTimeout( + disconnect: _mqttClient != null + ? (Func)(() => _mqttClient.DisconnectAsync(MqttClientDisconnectOptionsReason.NormalDisconnection)) + : null, + timeout: TimeSpan.FromSeconds(5), + onFault: ex => Log(MTConnectLogLevel.Warning, $"MQTT Relay Disconnect Error : {ex.Message}")); } private async Task Worker() diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs index 7271ad56a..f4d61d2ae 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/MqttRelayLifecycle.cs @@ -2,6 +2,7 @@ // TrakHound Inc. licenses this file to you under the MIT license. 
using System; +using System.Threading.Tasks; namespace MTConnect { @@ -42,5 +43,85 @@ public static void StopServers(Action documentStop, Action entityStop) try { entityStop(); } catch { } } } + + /// + /// Awaits an MQTT-client disconnect with a bounded timeout. + /// Replaces the previous fire-and-forget invocation in + /// OnStop(): + /// + /// * Synchronous throw from is + /// captured and routed to instead + /// of escaping the shutdown path. + /// * A faulted Task is surfaced to + /// with its inner exception. + /// * A disconnect that does not complete within + /// is treated as best-effort + /// success so a misbehaving broker cannot hang shutdown. + /// * A null is a no-op (the + /// worker may never have created an MQTT client). + /// + /// + /// Factory that begins the disconnect and returns its + /// . May be null. + /// + /// + /// Bound on how long shutdown is willing to wait. A timeout + /// elapses silently because the agent is already shutting down. + /// + /// + /// Logger callback invoked with the exception when the + /// disconnect throws synchronously or its Task faults. + /// + public static void DisconnectWithTimeout( + Func disconnect, + TimeSpan timeout, + Action onFault) + { + if (disconnect == null) return; + + Task task; + try + { + task = disconnect(); + } + catch (Exception ex) + { + if (onFault != null) onFault(ex); + return; + } + + if (task == null) return; + + try + { + if (!task.Wait(timeout)) + { + // Bounded wait elapsed; agent is shutting down so + // a hung broker cannot block process exit. Best + // effort: leave the task to be GC'd. Attach a + // continuation so its eventual fault is still + // surfaced rather than swallowed by the finalizer. + if (onFault != null) + { + task.ContinueWith( + t => onFault(t.Exception), + TaskContinuationOptions.OnlyOnFaulted); + } + return; + } + } + catch (AggregateException agg) + { + if (onFault != null) + { + var inner = agg.InnerException ?? 
agg; + onFault(inner); + } + } + catch (Exception ex) + { + if (onFault != null) onFault(ex); + } + } } } From 1eeaacf0d040a3cf414ec835d2481a2caddf2984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:03:10 +0200 Subject: [PATCH 19/29] test(agent-module): pin MqttRelay last-sent-sequence atomic policy Module.cs read and wrote the 64-bit _lastSentSequence field without Interlocked. On 32-bit hosts that is not atomic: a concurrent reader can observe a torn value (one half from the previous value, one from the new). MqttRelay reads the field from observation event handlers (multiple ThreadPool threads) while the durable-relay Worker writes it. A torn read can log a wildly wrong "unsent" figure and propagate to the persisted last-sent-sequence file, causing buffered observations to be skipped or re-sent on the next reconnect. Pin the atomic-access policy via a focused tracker: * Fresh tracker reads zero. * Write round-trips through Read. * Full ulong range (including high-bit values that map to negative long on Interlocked.Read) round-trips losslessly. * Last write wins: the tracker is not a max-watermark. * Concurrent writer/reader smoke test: the reader never observes a half-torn value (high half not as written by the writer). Tests fail to compile until LastSentSequenceTracker is introduced in the follow-up fix commit. --- .../LastSentSequenceTrackerTests.cs | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs new file mode 100644 index 000000000..cc6a623bb --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs @@ -0,0 +1,114 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. 
licenses this file to you under the MIT license. + +using System.Threading; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the atomic-access policy for the MqttRelay relay-progress + /// counter. Module.cs previously read and wrote + /// _lastSentSequence (a 64-bit field) without + /// , which on 32-bit hosts is not an + /// atomic operation: two 32-bit halves are read or written + /// independently and a concurrent reader can observe a torn value. + /// + /// MqttRelay reads _lastSentSequence from observation event + /// handlers (multiple ThreadPool threads) while the durable-relay + /// Worker writes it. A torn read could log a wildly wrong "unsent" + /// figure and, worse, be propagated to the persisted last-sent + /// sequence file, causing buffered observations to be skipped or + /// re-sent on the next reconnect. + /// + /// LastSentSequenceTracker centralises the atomic read/write so the + /// policy is unit-testable and so any future caller cannot + /// regress to a torn-read pattern. + /// + [TestFixture] + public class LastSentSequenceTrackerTests + { + [Test] + public void Read_returns_zero_on_fresh_tracker() + { + var tracker = new LastSentSequenceTracker(); + Assert.That(tracker.Read(), Is.EqualTo(0UL)); + } + + [Test] + public void Write_round_trips_through_read() + { + var tracker = new LastSentSequenceTracker(); + tracker.Write(42UL); + Assert.That(tracker.Read(), Is.EqualTo(42UL)); + } + + [Test] + public void Write_supports_full_ulong_range_round_trip() + { + // The broker sequence is unsigned; the tracker must round + // trip values whose bit pattern is negative when reinterpreted + // as a signed long (Interlocked operates on long internally). 
+ var tracker = new LastSentSequenceTracker(); + tracker.Write(ulong.MaxValue); + Assert.That(tracker.Read(), Is.EqualTo(ulong.MaxValue)); + } + + [Test] + public void Write_overwrites_previous_value() + { + var tracker = new LastSentSequenceTracker(); + tracker.Write(100UL); + tracker.Write(50UL); + Assert.That(tracker.Read(), Is.EqualTo(50UL), + "Last write wins; the tracker is not a max-watermark."); + } + + [Test] + public void Concurrent_writes_and_reads_observe_only_written_values() + { + // Smoke-test for non-atomic torn read: launch a writer + // ramping through the high ulong range and a reader + // sampling the value; the reader must never observe a + // value that was never written. With Interlocked the + // assertion is trivially true; with naive long field the + // assertion would fail on 32-bit hosts. + var tracker = new LastSentSequenceTracker(); + const int iterations = 5000; + + var writer = Task.Run(() => + { + for (int i = 1; i <= iterations; i++) + { + tracker.Write((ulong)i | 0xFFFFFFFF00000000UL); + } + }); + + var observed = 0; + var reader = Task.Run(() => + { + while (!writer.IsCompleted) + { + var v = tracker.Read(); + // Either pre-write zero or any value the writer set. + var low = v & 0x00000000FFFFFFFFUL; + var high = v & 0xFFFFFFFF00000000UL; + if (v != 0UL) + { + Assert.That(high, Is.EqualTo(0xFFFFFFFF00000000UL), + "Torn read: high half not as written."); + Assert.That(low, Is.LessThanOrEqualTo((ulong)iterations)); + observed++; + } + } + }); + + Task.WaitAll(writer, reader); + + // Sanity: writer ran, reader did sample at least once. 
+ Assert.That(tracker.Read(), Is.EqualTo((ulong)iterations | 0xFFFFFFFF00000000UL)); + Assert.That(observed, Is.GreaterThan(0)); + } + } +} From 4b5a860c40076fccf4a6002139d3e8635c5ee6f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:04:15 +0200 Subject: [PATCH 20/29] fix(agent-module): atomic 64-bit access for MqttRelay last-sent count Module.cs read and wrote the 64-bit _lastSentSequence field without Interlocked. On 32-bit hosts a 64-bit field is read and written as two 32-bit halves and a concurrent reader can observe a torn value. MqttRelay reads the counter from observation event handlers (multiple ThreadPool threads) while the durable-relay Worker writes it. A torn read could log a wildly wrong "unsent" figure and propagate to the persisted last-sent-sequence file. Introduce LastSentSequenceTracker, a small class that wraps every read/write in Interlocked.Read / Interlocked.Exchange so the access is atomic on every supported runtime. Replace the bare _lastSentSequence field with the tracker and route all three call sites (RelayBufferedObservations diagnostic compute and AgentObservationAdded durable-relay path) through Read/Write. Pinned by LastSentSequenceTrackerTests including a concurrent writer/reader smoke check that asserts no torn high-half values are observed. --- .../LastSentSequenceTracker.cs | 44 +++++++++++++++++++ .../Module.cs | 9 ++-- 2 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequenceTracker.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequenceTracker.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequenceTracker.cs new file mode 100644 index 000000000..e66ba60b0 --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequenceTracker.cs @@ -0,0 +1,44 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. 
licenses this file to you under the MIT license. + +using System.Threading; + +namespace MTConnect +{ + /// + /// Centralises atomic 64-bit access to the MqttRelay last-sent + /// sequence counter. Module.cs previously read and wrote a bare + /// field. On 32-bit hosts a 64-bit field is read + /// and written as two 32-bit halves and a concurrent reader can + /// observe a torn value. MqttRelay reads the counter from + /// observation event handlers (multiple ThreadPool threads) while + /// the durable-relay Worker writes it, so the access pattern was + /// genuinely racy. The tracker wraps every read/write in + /// primitives so the policy is enforced + /// uniformly and is unit-testable. + /// + public sealed class LastSentSequenceTracker + { + private long _value; + + /// + /// Returns the current sequence value with an atomic 64-bit + /// read. Round-trips the full range, + /// including values whose bit pattern is negative when + /// reinterpreted as a signed . + /// + public ulong Read() + { + return unchecked((ulong)Interlocked.Read(ref _value)); + } + + /// + /// Sets the current sequence value with an atomic 64-bit write. + /// Last write wins; this is not a max-watermark. + /// + public void Write(ulong value) + { + Interlocked.Exchange(ref _value, unchecked((long)value)); + } + } +} diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 991643ce7..87a99e8c7 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -37,7 +37,9 @@ public class Module : MTConnectAgentModule private CancellationTokenSource _stop; private static readonly object _lastSentSequenceLock = new object(); private long _totalIncomingObservations = 0; - private long _lastSentSequence = 0; + // Atomic 64-bit access on 32-bit hosts: a bare long field would + // tear under the durable-relay observation event handlers. 
+ private readonly LastSentSequenceTracker _lastSentSequenceTracker = new LastSentSequenceTracker(); private const string DirectoryBuffer = "buffer"; private const string LastSentSequenceFileName = "mqttrelay_last_sent.seq"; @@ -335,7 +337,8 @@ private async Task RelayBufferedObservations() } Log(MTConnectLogLevel.Information, $"[MQTT Relay] Buffered observation relay complete. {observations.Count} missed observations sent."); - var unsent = broker.LastSequence > (ulong)_lastSentSequence ? broker.LastSequence - (ulong)_lastSentSequence : 0; + var lastSentSnapshot = _lastSentSequenceTracker.Read(); + var unsent = broker.LastSequence > lastSentSnapshot ? broker.LastSequence - lastSentSnapshot : 0; if (unsent > 0) Log(MTConnectLogLevel.Information, $"[MQTT Relay] {unsent} new observations arrived during relay and will be sent next."); } @@ -700,7 +703,7 @@ private async void AgentObservationAdded(object sender, IObservation observation Interlocked.Increment(ref _totalIncomingObservations); var lastSent = GetLastSentSequence(); - _lastSentSequence = (long)lastSent; + _lastSentSequenceTracker.Write(lastSent); if (_entityServer != null) { From 89561dacc290c7511ba4e409ec912330d6be81b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:05:14 +0200 Subject: [PATCH 21/29] test(agent-module): pin MqttRelay async-void handler safety policy Module.cs exposes seven event handlers the agent broker invokes as async void. Three (AgentDeviceAdded, AgentObservationAdded, AgentAssetAdded) had no top-level try/catch; the other four guarded only the inner publish. An async-void method that throws routes its exception to the synchronization context, which on the ThreadPool tears down the host process. A formatter throw or a synchronous broker call (DataItem null-deref, broker shutting down) crashed the agent. Pin the safety policy via a focused guard helper: * Successful body must not invoke the fault logger. 
* Synchronous throw must route to the logger. * Async throw (Task fault) must route to the logger. * Null logger must not rethrow: the guard absorbs the fault either way. * Throwing logger must not corrupt the guard contract. * Null body must no-op (defensive against a mis-wired handler). Tests fail to compile until AsyncVoidGuard is introduced in the follow-up fix commit. --- .../AsyncVoidGuardTests.cs | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AsyncVoidGuardTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AsyncVoidGuardTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AsyncVoidGuardTests.cs new file mode 100644 index 000000000..c63947165 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/AsyncVoidGuardTests.cs @@ -0,0 +1,106 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay async-void event-handler safety policy. The + /// module exposes seven event handlers that the agent broker + /// invokes as async void: + /// + /// AgentDeviceAdded, AgentObservationAdded, AgentAssetAdded, + /// ProbeReceived, CurrentReceived, SampleReceived, + /// AssetReceived + /// + /// An async void method that throws routes its exception to + /// the synchronization context, which on the ThreadPool tears down + /// the host process. Three of the seven handlers (AgentDeviceAdded, + /// AgentObservationAdded, AgentAssetAdded) had no top-level + /// try/catch; the other four guarded only the inner publish. A + /// throw from the formatter or from a synchronous broker call + /// (DataItem null-deref, broker shutting down) crashed the agent. 
+ /// + /// AsyncVoidGuard.Run wraps the handler body and routes any + /// exception to a logging callback so the policy is enforced + /// uniformly and is unit-testable. Pinning the policy here also + /// prevents a future contributor from re-introducing an unguarded + /// async-void path. + /// + [TestFixture] + public class AsyncVoidGuardTests + { + [Test] + public async Task Run_completes_normally_when_body_succeeds() + { + string loggedFault = null; + + await AsyncVoidGuard.Run( + async () => { await Task.Yield(); }, + ex => loggedFault = ex.Message); + + Assert.That(loggedFault, Is.Null, + "Successful body must not invoke the fault logger."); + } + + [Test] + public async Task Run_routes_synchronous_throw_to_logger() + { + string loggedFault = null; + + await AsyncVoidGuard.Run( + () => throw new InvalidOperationException("sync throw"), + ex => loggedFault = ex.Message); + + Assert.That(loggedFault, Is.EqualTo("sync throw")); + } + + [Test] + public async Task Run_routes_async_throw_to_logger() + { + string loggedFault = null; + + await AsyncVoidGuard.Run( + async () => { await Task.Yield(); throw new InvalidOperationException("async throw"); }, + ex => loggedFault = ex.Message); + + Assert.That(loggedFault, Is.EqualTo("async throw")); + } + + [Test] + public async Task Run_does_not_rethrow_when_logger_is_null() + { + // Even with a null logger, the guard must swallow the + // exception so an async-void handler cannot crash the host. + await AsyncVoidGuard.Run( + () => throw new InvalidOperationException("no logger"), + onFault: null); + + Assert.Pass(); + } + + [Test] + public async Task Run_does_not_rethrow_when_logger_itself_throws() + { + // A misbehaving logger must not corrupt the guard + // contract: the originating handler must still complete. 
+ await AsyncVoidGuard.Run( + () => throw new InvalidOperationException("body"), + ex => throw new InvalidOperationException("logger crashed")); + + Assert.Pass(); + } + + [Test] + public async Task Run_no_ops_when_body_is_null() + { + // A misuse case (handler delegate not wired); the guard + // must not throw NullReferenceException of its own. + await AsyncVoidGuard.Run(body: null, onFault: _ => { }); + Assert.Pass(); + } + } +} From 369d7dcb65320599c9b3e9d56f2307e163a0a506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:07:23 +0200 Subject: [PATCH 22/29] fix(agent-module): wrap MqttRelay async-void handlers with fault log Module.cs exposes seven event handlers the agent broker invokes as async void. Three of them (AgentDeviceAdded, AgentObservationAdded, AgentAssetAdded) had no top-level try/catch; the other four (ProbeReceived, CurrentReceived, SampleReceived, AssetReceived) guarded only the inner publish but not the formatter call. An async void method that throws routes its exception to the synchronization context, and on the ThreadPool that tears down the host process. So a formatter throw, a synchronous broker call (DataItem null-deref, broker shutting down), or an unexpected publish error crashed the agent. Add AsyncVoidGuard.Run, a tiny helper that awaits a delegate and routes any synchronous or async exception to a logging callback, never rethrowing. Wrap every async void handler with it. The four formatter handlers are split into a slim async void shell plus an async Task *Core method so the existing inner try/catch logic stays intact while the Core call itself is now guarded. Pinned by AsyncVoidGuardTests. 
--- .../AsyncVoidGuard.cs | 59 ++++++++ .../Module.cs | 128 +++++++++++++----- 2 files changed, 152 insertions(+), 35 deletions(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AsyncVoidGuard.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AsyncVoidGuard.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AsyncVoidGuard.cs new file mode 100644 index 000000000..0be86025a --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/AsyncVoidGuard.cs @@ -0,0 +1,59 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; +using System.Threading.Tasks; + +namespace MTConnect +{ + /// + /// Centralises the MqttRelay agent-module async void + /// event-handler safety policy. An async void method that + /// throws routes its exception to the synchronization context, + /// which on the ThreadPool tears down the host process. The + /// MqttRelay handlers run on broker-raised events, so without a + /// top-level guard a formatter throw (DataItem null-deref) or a + /// synchronous broker call (broker shutting down) crashed the + /// agent. + /// + /// Wrapping every handler body in guarantees that + /// any exception, synchronous or async, is captured and routed to + /// a logging callback rather than escaping to the synchronization + /// context. The guard is also unit-testable in isolation, so the + /// policy is pinned away from the (hard-to-mock) Module class. + /// + public static class AsyncVoidGuard + { + /// + /// Awaits the supplied handler body and routes any exception + /// to the logging callback. Never rethrows: the contract is + /// "an async-void path must not crash the host". A null body + /// is a no-op and a null or throwing logger is tolerated. + /// + /// + /// The handler implementation. May be null. 
+ /// + /// + /// Logger callback invoked with any exception thrown + /// synchronously by or surfaced through + /// its . May be null. + /// + public static async Task Run(Func body, Action onFault) + { + if (body == null) return; + + try + { + var task = body(); + if (task != null) await task.ConfigureAwait(false); + } + catch (Exception ex) + { + if (onFault != null) + { + try { onFault(ex); } catch { } + } + } + } + } +} diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 87a99e8c7..2bb525e18 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -391,6 +391,16 @@ private void SetLastSentSequence(ulong seq) } private async void ProbeReceived(IDevice device, IDevicesResponseDocument responseDocument) + { + // async void handler: route any unhandled fault (formatter + // throw, message-builder failure) to the log instead of the + // synchronization context. + await AsyncVoidGuard.Run( + async () => await ProbeReceivedCore(device, responseDocument), + ex => Log(MTConnectLogLevel.Warning, $"ProbeReceived handler error : {ex.Message}")); + } + + private async Task ProbeReceivedCore(IDevice device, IDevicesResponseDocument responseDocument) { if (_mqttClient != null && _mqttClient.IsConnected) { @@ -467,6 +477,15 @@ private async void ProbeReceived(IDevice device, IDevicesResponseDocument respon } private async void CurrentReceived(IDevice device, IStreamsResponseOutputDocument responseDocument) + { + // async void handler: route any unhandled fault to the log + // instead of the synchronization context. 
+ await AsyncVoidGuard.Run( + async () => await CurrentReceivedCore(device, responseDocument), + ex => Log(MTConnectLogLevel.Warning, $"CurrentReceived handler error : {ex.Message}")); + } + + private async Task CurrentReceivedCore(IDevice device, IStreamsResponseOutputDocument responseDocument) { if (_mqttClient != null && _mqttClient.IsConnected) { @@ -507,6 +526,15 @@ private async void CurrentReceived(IDevice device, IStreamsResponseOutputDocumen } private async void SampleReceived(IDevice device, IStreamsResponseOutputDocument responseDocument) + { + // async void handler: route any unhandled fault to the log + // instead of the synchronization context. + await AsyncVoidGuard.Run( + async () => await SampleReceivedCore(device, responseDocument), + ex => Log(MTConnectLogLevel.Warning, $"SampleReceived handler error : {ex.Message}")); + } + + private async Task SampleReceivedCore(IDevice device, IStreamsResponseOutputDocument responseDocument) { if (_mqttClient != null && _mqttClient.IsConnected) { @@ -547,6 +575,15 @@ private async void SampleReceived(IDevice device, IStreamsResponseOutputDocument } private async void AssetReceived(IDevice device, IAssetsResponseDocument responseDocument) + { + // async void handler: route any unhandled fault to the log + // instead of the synchronization context. 
+ await AsyncVoidGuard.Run( + async () => await AssetReceivedCore(device, responseDocument), + ex => Log(MTConnectLogLevel.Warning, $"AssetReceived handler error : {ex.Message}")); + } + + private async Task AssetReceivedCore(IDevice device, IAssetsResponseDocument responseDocument) { if (_mqttClient != null && _mqttClient.IsConnected && responseDocument != null && !responseDocument.Assets.IsNullOrEmpty()) { @@ -693,59 +730,80 @@ private async Task PublishAssets() private async void AgentDeviceAdded(object sender, IDevice device) { - if (_entityServer != null) await _entityServer.PublishDevice(_mqttClient, device); + // async void handlers must not throw; route any fault to + // the log instead of the synchronization context. + await AsyncVoidGuard.Run( + async () => + { + if (_entityServer != null) await _entityServer.PublishDevice(_mqttClient, device); + }, + ex => Log(MTConnectLogLevel.Warning, $"AgentDeviceAdded handler error : {ex.Message}")); } private async void AgentObservationAdded(object sender, IObservation observation) { - if (_configuration.DurableRelay) - { - Interlocked.Increment(ref _totalIncomingObservations); - - var lastSent = GetLastSentSequence(); - _lastSentSequenceTracker.Write(lastSent); - - if (_entityServer != null) + // async void handlers must not throw; route any fault to + // the log instead of the synchronization context. 
+ await AsyncVoidGuard.Run( + async () => { - if (observation.Category == DataItemCategory.CONDITION) + if (_configuration.DurableRelay) { - var conditionObservations = Agent.GetCurrentObservations(observation.DeviceUuid, observation.DataItemId); - var result = await _entityServer.PublishObservations(_mqttClient, conditionObservations.OfType()); - if (result != null && result.IsSuccess && conditionObservations != null && conditionObservations.Any()) + Interlocked.Increment(ref _totalIncomingObservations); + + var lastSent = GetLastSentSequence(); + _lastSentSequenceTracker.Write(lastSent); + + if (_entityServer != null) { - SetLastSentSequence(conditionObservations.Max(o => o.Sequence)); + if (observation.Category == DataItemCategory.CONDITION) + { + var conditionObservations = Agent.GetCurrentObservations(observation.DeviceUuid, observation.DataItemId); + var result = await _entityServer.PublishObservations(_mqttClient, conditionObservations.OfType()); + if (result != null && result.IsSuccess && conditionObservations != null && conditionObservations.Any()) + { + SetLastSentSequence(conditionObservations.Max(o => o.Sequence)); + } + } + else + { + var result = await _entityServer.PublishObservation(_mqttClient, observation); + if (result != null && result.IsSuccess) + { + SetLastSentSequence(observation.Sequence); + } + } } } else { - var result = await _entityServer.PublishObservation(_mqttClient, observation); - if (result != null && result.IsSuccess) + if (_entityServer != null) { - SetLastSentSequence(observation.Sequence); + if (observation.Category == DataItemCategory.CONDITION) + { + var conditionObservations = Agent.GetCurrentObservations(observation.DeviceUuid, observation.DataItemId); + await PublishObservations(conditionObservations); + } + else + { + await _entityServer.PublishObservation(_mqttClient, observation); + } } } - } - } - else - { - if (_entityServer != null) - { - if (observation.Category == DataItemCategory.CONDITION) - { - var 
conditionObservations = Agent.GetCurrentObservations(observation.DeviceUuid, observation.DataItemId); - await PublishObservations(conditionObservations); - } - else - { - await _entityServer.PublishObservation(_mqttClient, observation); - } - } - } + }, + ex => Log(MTConnectLogLevel.Warning, $"AgentObservationAdded handler error : {ex.Message}")); } private async void AgentAssetAdded(object sender, IAsset asset) { - if (_entityServer != null) await _entityServer.PublishAsset(_mqttClient, asset); + // async void handlers must not throw; route any fault to + // the log instead of the synchronization context. + await AsyncVoidGuard.Run( + async () => + { + if (_entityServer != null) await _entityServer.PublishAsset(_mqttClient, asset); + }, + ex => Log(MTConnectLogLevel.Warning, $"AgentAssetAdded handler error : {ex.Message}")); } From efd006c0ded30316391d67e4ef93a495666bed3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:08:23 +0200 Subject: [PATCH 23/29] test(agent-module): pin MqttRelay Worker outer-catch logging policy The Worker do/while loop in Module.cs previously swallowed any unexpected exception escaping the inner try/catch with a bare "catch (Exception) { }". The relay quietly entered the reconnect delay branch and the underlying defect (a throw inside the inner finally, or an oversight in connection handling) went undiagnosed for the lifetime of the agent. Pin the policy via WorkerLoopExceptionLogger: * TaskCanceledException is the orderly-shutdown signal; do not log. * OperationCanceledException (parent type) is also a shutdown signal. * Any other exception is genuinely unexpected and must be logged with the exception type name and message. * Null exception is a no-op; null callback is tolerated. Tests fail to compile until the helper is introduced in the follow-up fix commit. 
--- .../WorkerLoopExceptionLoggerTests.cs | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/WorkerLoopExceptionLoggerTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/WorkerLoopExceptionLoggerTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/WorkerLoopExceptionLoggerTests.cs new file mode 100644 index 000000000..dcd4c0c04 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/WorkerLoopExceptionLoggerTests.cs @@ -0,0 +1,116 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; +using System.Threading.Tasks; +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay Worker outer-catch logging policy. The + /// Worker do/while loop in Module.cs previously had: + /// + /// catch (TaskCanceledException) { } + /// catch (Exception) { } + /// + /// The bare empty catch on the outer loop swallowed any unexpected + /// exception escaping the inner try/catch (for example a throw + /// inside the inner finally block or an oversight in connection + /// handling). The operator saw nothing, the relay quietly entered + /// the reconnect-delay branch, and the underlying defect went + /// undiagnosed for the lifetime of the agent. + /// + /// WorkerLoopExceptionLogger encodes the policy: + /// + /// * TaskCanceledException is the orderly-shutdown signal; do not + /// log it (would spam the operator on every stop). + /// * Any other exception is genuinely unexpected at this scope; + /// log it at Warning so the underlying defect is visible. 
+ /// + [TestFixture] + public class WorkerLoopExceptionLoggerTests + { + [Test] + public void Log_skips_TaskCanceledException() + { + var logged = false; + + WorkerLoopExceptionLogger.Log( + exception: new TaskCanceledException(), + onLog: _ => logged = true); + + Assert.That(logged, Is.False, + "TaskCanceledException is the orderly-shutdown signal; do not spam the log on every stop."); + } + + [Test] + public void Log_writes_unexpected_exception_to_callback() + { + string logged = null; + + WorkerLoopExceptionLogger.Log( + exception: new InvalidOperationException("boom"), + onLog: msg => logged = msg); + + Assert.That(logged, Is.Not.Null); + Assert.That(logged, Does.Contain("boom"), + "The unexpected-exception message must include the exception text so the operator can diagnose the defect."); + } + + [Test] + public void Log_includes_exception_type_name() + { + string logged = null; + + WorkerLoopExceptionLogger.Log( + exception: new InvalidOperationException("boom"), + onLog: msg => logged = msg); + + Assert.That(logged, Does.Contain(nameof(InvalidOperationException)), + "Type name aids in classifying the defect from log scrapes."); + } + + [Test] + public void Log_no_ops_when_exception_is_null() + { + var logged = false; + + WorkerLoopExceptionLogger.Log( + exception: null, + onLog: _ => logged = true); + + Assert.That(logged, Is.False, + "A null exception cannot be logged usefully; do not invoke the callback."); + } + + [Test] + public void Log_no_ops_when_callback_is_null() + { + // Defensive: the helper must not throw when the logger is + // not wired (would defeat the purpose of catching the + // unexpected exception). 
+ Assert.DoesNotThrow(() => WorkerLoopExceptionLogger.Log( + exception: new InvalidOperationException("boom"), + onLog: null)); + } + + [Test] + public void Log_treats_subclass_of_TaskCanceledException_as_cancellation() + { + // OperationCanceledException is the parent type; a + // cancelation token expiring throws TaskCanceledException + // (a subclass). Future runtime versions could extend the + // hierarchy; the policy must treat any cancelation as + // an orderly-shutdown signal. + var logged = false; + + WorkerLoopExceptionLogger.Log( + exception: new OperationCanceledException(), + onLog: _ => logged = true); + + Assert.That(logged, Is.False, + "OperationCanceledException is also an orderly-shutdown signal; do not log."); + } + } +} From 9b8c0542918a3bbd10bfd3b9f8f790574d8a79c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:10:15 +0200 Subject: [PATCH 24/29] test(agent-module): stabilise LastSentSequenceTracker concurrent smoke The concurrent writer/reader smoke test asserted observed > 0 sample hits as a sanity check, but on a fast machine the writer completed before the reader took its first sample, failing the assertion without indicating a real defect. The atomicity assertions inside the reader loop are the load-bearing checks; the sample-count sanity was not. Drop the brittle observed-count assertion. Increase the iteration count (5_000 -> 100_000) so even a fast writer cannot beat the reader, and add a ManualResetEventSlim startGate so both tasks begin under the same observation window. The torn-read assertions inside the reader loop are unchanged and still load-bearing. 
--- .../LastSentSequenceTrackerTests.cs | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs index cc6a623bb..ea2f4cc5c 100644 --- a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequenceTrackerTests.cs @@ -75,40 +75,44 @@ public void Concurrent_writes_and_reads_observe_only_written_values() // assertion is trivially true; with naive long field the // assertion would fail on 32-bit hosts. var tracker = new LastSentSequenceTracker(); - const int iterations = 5000; + const int iterations = 100_000; + var startGate = new System.Threading.ManualResetEventSlim(false); var writer = Task.Run(() => { + startGate.Wait(); for (int i = 1; i <= iterations; i++) { tracker.Write((ulong)i | 0xFFFFFFFF00000000UL); } }); - var observed = 0; var reader = Task.Run(() => { + startGate.Wait(); while (!writer.IsCompleted) { var v = tracker.Read(); - // Either pre-write zero or any value the writer set. - var low = v & 0x00000000FFFFFFFFUL; - var high = v & 0xFFFFFFFF00000000UL; if (v != 0UL) { + var low = v & 0x00000000FFFFFFFFUL; + var high = v & 0xFFFFFFFF00000000UL; + // The high half is the canary: the writer + // never sets a value with high half != the + // sentinel, so observing anything else is a + // torn read. Assert.That(high, Is.EqualTo(0xFFFFFFFF00000000UL), "Torn read: high half not as written."); Assert.That(low, Is.LessThanOrEqualTo((ulong)iterations)); - observed++; } } }); + startGate.Set(); Task.WaitAll(writer, reader); - // Sanity: writer ran, reader did sample at least once. + // Final state must be the writer's last value. 
Assert.That(tracker.Read(), Is.EqualTo((ulong)iterations | 0xFFFFFFFF00000000UL)); - Assert.That(observed, Is.GreaterThan(0)); } } } From c96f5fdb7162436df5a96a5fe2974fcf20010312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:10:31 +0200 Subject: [PATCH 25/29] fix(agent-module): log unexpected MqttRelay Worker faults The Worker do/while outer-catch in Module.cs was a bare empty "catch (Exception) { }". Any unexpected exception escaping the inner try/catch (a throw inside the inner finally, an oversight in connection handling) was silently swallowed. The relay quietly entered the reconnect-delay branch and the underlying defect went undiagnosed for the lifetime of the agent. Introduce WorkerLoopExceptionLogger.Log: log any non-cancellation exception at Warning with the exception type name and message. TaskCanceledException and OperationCanceledException stay silent because they are the orderly-shutdown signal. Route the outer-catch through the helper so the policy is unit-testable. Pinned by WorkerLoopExceptionLoggerTests. --- .../Module.cs | 10 ++++- .../WorkerLoopExceptionLogger.cs | 39 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/WorkerLoopExceptionLogger.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 2bb525e18..7a8fd260b 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -259,7 +259,15 @@ private async Task Worker() await Task.Delay(_configuration.ReconnectInterval, _stop.Token); } catch (TaskCanceledException) { } - catch (Exception) { } + catch (Exception ex) + { + // Route through WorkerLoopExceptionLogger so the + // operator sees an unexpected defect at the outer + // scope instead of having it silently swallowed. 
+ WorkerLoopExceptionLogger.Log( + exception: ex, + onLog: msg => Log(MTConnectLogLevel.Warning, msg)); + } } while (!_stop.Token.IsCancellationRequested); } diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/WorkerLoopExceptionLogger.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/WorkerLoopExceptionLogger.cs new file mode 100644 index 000000000..5b31667ca --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/WorkerLoopExceptionLogger.cs @@ -0,0 +1,39 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; +using System.Threading.Tasks; + +namespace MTConnect +{ + /// + /// Encodes the MqttRelay Worker outer-catch logging policy. The + /// outer-loop catch in Module.Worker previously had a bare empty + /// catch on , swallowing any unexpected + /// exception escaping the inner try/catch. The relay quietly + /// entered the reconnect delay branch and the underlying defect + /// went undiagnosed for the lifetime of the agent. + /// + /// This helper centralizes the policy: skip the orderly-shutdown + /// cancelation signals, log everything else with type and message + /// so the operator can diagnose the defect from log scrapes. + /// + public static class WorkerLoopExceptionLogger + { + /// + /// Routes to + /// unless the exception is a cancelation signal + /// ( or its + /// subclass), in which case + /// the policy is silence (orderly shutdown). 
+ /// + public static void Log(Exception exception, Action onLog) + { + if (exception == null) return; + if (onLog == null) return; + if (exception is OperationCanceledException) return; + + onLog($"MQTT Relay Worker unexpected error : {exception.GetType().Name} : {exception.Message}"); + } + } +} From 8350cc8c9744b23b312e9639a534108a47b5a0a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:12:01 +0200 Subject: [PATCH 26/29] test(agent-module): pin MqttRelay observation grouping single-pass Module.PublishObservations iterated the input enumerable up to three times per distinct DataItemId (Distinct, Where, FirstOrDefault). For n observations across k distinct data items the cost is O(n*k), and each iteration may re-execute a deferred upstream query (the broker's observation enumerator). On large agents (thousands of observations across hundreds of data items) the catch-up after a reconnect was materially slowed. Pin the grouping policy via a focused helper: * Null and empty input return empty. * Distinct DataItemId values become distinct groups. * The source enumerable is iterated exactly once (smoke-tested with a counting iterator). * Encounter order is preserved within each group so callers relying on sequence-monotonic ordering are not broken. * Null DataItemId is tolerated as its own (null-keyed) group rather than crashing the enumeration. Tests fail to compile until ObservationGrouper is introduced in the follow-up fix commit. 
--- .../ObservationGrouperTests.cs | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ObservationGrouperTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ObservationGrouperTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ObservationGrouperTests.cs new file mode 100644 index 000000000..d411f4532 --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/ObservationGrouperTests.cs @@ -0,0 +1,179 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System; +using System.Collections.Generic; +using System.Linq; +using MTConnect.Devices; +using MTConnect.Observations; +using MTConnect.Observations.Output; +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay PublishObservations grouping policy. The + /// previous implementation iterated the input enumerable up to + /// three times per distinct DataItemId: + /// + /// var dataItemIds = observations.Select(o => o.DataItemId).Distinct(); + /// foreach (var dataItemId in dataItemIds) + /// { + /// var dataItemObservations = observations.Where(o => o.DataItemId == dataItemId); + /// var dataItemObservation = dataItemObservations.FirstOrDefault(); + /// ... + /// } + /// + /// That is O(n*k) where n is the observation count and k is the + /// distinct-DataItemId count, plus repeated enumeration of an + /// IEnumerable that may be a deferred/expensive query. On large + /// agents (thousands of observations across hundreds of data + /// items) that materially slowed the relay catch-up after a + /// reconnect. + /// + /// ObservationGrouper.GroupByDataItem produces a single-pass + /// grouping: an IEnumerable of (DataItemId, observations) groups, + /// each iteration of the source happening exactly once. 
+ /// + [TestFixture] + public class ObservationGrouperTests + { + [Test] + public void GroupByDataItem_returns_empty_when_input_null() + { + Assert.That(ObservationGrouper.GroupByDataItem(null), Is.Empty); + } + + [Test] + public void GroupByDataItem_returns_empty_when_input_empty() + { + Assert.That( + ObservationGrouper.GroupByDataItem(new List()), + Is.Empty); + } + + [Test] + public void GroupByDataItem_groups_observations_by_data_item_id() + { + var input = new List + { + Stub("A", DataItemCategory.SAMPLE, sequence: 1), + Stub("B", DataItemCategory.EVENT, sequence: 2), + Stub("A", DataItemCategory.SAMPLE, sequence: 3), + Stub("C", DataItemCategory.CONDITION, sequence: 4), + Stub("B", DataItemCategory.EVENT, sequence: 5), + }; + + var groups = ObservationGrouper.GroupByDataItem(input).ToList(); + + Assert.That(groups.Select(g => g.Key), Is.EquivalentTo(new[] { "A", "B", "C" })); + + var groupA = groups.Single(g => g.Key == "A").ToList(); + Assert.That(groupA.Select(o => o.Sequence), Is.EquivalentTo(new ulong[] { 1, 3 })); + + var groupB = groups.Single(g => g.Key == "B").ToList(); + Assert.That(groupB.Select(o => o.Sequence), Is.EquivalentTo(new ulong[] { 2, 5 })); + + var groupC = groups.Single(g => g.Key == "C").ToList(); + Assert.That(groupC.Select(o => o.Sequence), Is.EquivalentTo(new ulong[] { 4 })); + } + + [Test] + public void GroupByDataItem_iterates_source_exactly_once() + { + // Pins the perf-fix contract. The previous Module.cs path + // iterated the source up to three times (Distinct, Where, + // FirstOrDefault) per group. The grouping helper must take + // a single pass over the source. 
+ var iterationCount = 0; + IEnumerable Source() + { + iterationCount++; + yield return Stub("A", DataItemCategory.SAMPLE, sequence: 1); + yield return Stub("B", DataItemCategory.EVENT, sequence: 2); + yield return Stub("A", DataItemCategory.SAMPLE, sequence: 3); + } + + var groups = ObservationGrouper.GroupByDataItem(Source()).ToList(); + // Force eager enumeration of every group. + foreach (var g in groups) _ = g.ToList(); + + Assert.That(iterationCount, Is.EqualTo(1), + "Grouping must iterate the source exactly once."); + Assert.That(groups, Has.Count.EqualTo(2)); + } + + [Test] + public void GroupByDataItem_preserves_first_seen_order_per_group() + { + // Useful for callers that rely on encounter order (the + // condition path) for sequence-monotonic publishing. + var input = new List + { + Stub("A", DataItemCategory.SAMPLE, sequence: 10), + Stub("A", DataItemCategory.SAMPLE, sequence: 11), + Stub("A", DataItemCategory.SAMPLE, sequence: 12), + }; + + var groupA = ObservationGrouper.GroupByDataItem(input).Single(); + Assert.That( + groupA.Select(o => o.Sequence).ToList(), + Is.EqualTo(new ulong[] { 10, 11, 12 })); + } + + [Test] + public void GroupByDataItem_handles_null_data_item_id_as_distinct_group() + { + // Defensive: an IObservationOutput stub with a null + // DataItemId should not crash the grouping; it should + // appear as its own (null-keyed) group so the caller can + // decide what to do with it. 
+ var input = new List + { + Stub(null, DataItemCategory.SAMPLE, sequence: 1), + Stub("A", DataItemCategory.SAMPLE, sequence: 2), + Stub(null, DataItemCategory.SAMPLE, sequence: 3), + }; + + var groups = ObservationGrouper.GroupByDataItem(input).ToList(); + + Assert.That(groups, Has.Count.EqualTo(2)); + var nullGroup = groups.Single(g => g.Key == null).ToList(); + Assert.That(nullGroup.Select(o => o.Sequence), Is.EquivalentTo(new ulong[] { 1, 3 })); + } + + private static IObservationOutput Stub(string dataItemId, DataItemCategory category, ulong sequence) + { + return new ObservationOutputStub(dataItemId, category, sequence); + } + + private sealed class ObservationOutputStub : IObservationOutput + { + public ObservationOutputStub(string dataItemId, DataItemCategory category, ulong sequence) + { + DataItemId = dataItemId; + Category = category; + Sequence = sequence; + } + + public string DeviceUuid => "device-1"; + public IDataItem DataItem => null; + public string DataItemId { get; } + public DataItemCategory Category { get; } + public string Type => "TYPE"; + public string SubType => null; + public string Name => null; + public ulong InstanceId => 0UL; + public ulong Sequence { get; } + public DateTime Timestamp => DateTime.UtcNow; + public DateTimeOffset TimeZoneTimestamp => DateTimeOffset.UtcNow; + public string CompositionId => null; + public DataItemRepresentation Representation => DataItemRepresentation.VALUE; + public Quality Quality => Quality.VALID; + public bool Deprecated => false; + public bool Extended => false; + public ObservationValue[] Values => Array.Empty(); + public string GetValue(string valueKey) => null; + } + } +} From d9fd5c8f8f312aa192d26b2a3da1a85d027ffefe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:13:06 +0200 Subject: [PATCH 27/29] fix(agent-module): single-pass observation grouping in MqttRelay Module.PublishObservations iterated the input enumerable up to three times per distinct 
DataItemId (Distinct, Where, FirstOrDefault). For n observations across k distinct data items the cost was O(n*k) and each iteration could re-execute a deferred upstream broker query. On large agents that throttled the catch-up after a reconnect. Introduce ObservationGrouper.GroupByDataItem: materialise the source once and group by DataItemId, returning IGrouping entries that preserve encounter order within each group. Refactor PublishObservations to use a foreach over groups, materialise each group once into a List, and dispatch CONDITION versus single-value data items off the first item's category. Extract the IObservationOutput -> Observation copy into a static CloneAsObservation helper so the condition and non-condition paths share the projection. Pinned by ObservationGrouperTests. --- .../Module.cs | 95 +++++++++---------- .../ObservationGrouper.cs | 40 ++++++++ 2 files changed, 85 insertions(+), 50 deletions(-) create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/ObservationGrouper.cs diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 7a8fd260b..8b13ead04 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -661,65 +661,60 @@ private async Task PublishCurrentObservations() private async Task PublishObservations(IEnumerable observations) { - if (!observations.IsNullOrEmpty()) + if (observations.IsNullOrEmpty()) return; + + // Single-pass grouping replaces the previous O(n*k) + // Distinct + Where + FirstOrDefault pattern that iterated + // the source up to three times per distinct DataItemId. 
+ foreach (var group in ObservationGrouper.GroupByDataItem(observations)) { - var dataItemIds = observations.Select(o => o.DataItemId).Distinct(); - foreach (var dataItemId in dataItemIds) - { - var dataItemObservations = observations.Where(o => o.DataItemId == dataItemId); - var dataItemObservation = dataItemObservations.FirstOrDefault(); + // Materialise each group's observations once: the + // condition path needs to enumerate twice (test the + // first item's category and rebuild every observation) + // and re-iterating the IGrouping would re-iterate the + // upstream source. + var groupObservations = group.ToList(); + if (groupObservations.Count == 0) continue; - if (dataItemObservation.Category == DataItemCategory.CONDITION) - { - // Conditions have multiple observations - var multipleObservations = new List(); - foreach (var observation in dataItemObservations) - { - var x = new Observation(); - x.DeviceUuid = observation.DeviceUuid; - x.DataItemId = observation.DataItemId; - x.DataItem = observation.DataItem; - x.Name = observation.Name; - x.Category = observation.Category; - x.Type = observation.Type; - x.SubType = observation.SubType; - x.Representation = observation.Representation; - x.CompositionId = observation.CompositionId; - x.InstanceId = observation.InstanceId; - x.Sequence = observation.Sequence; - x.Timestamp = observation.Timestamp; - x.AddValues(observation.Values); - - multipleObservations.Add(x); - } + var first = groupObservations[0]; - await _entityServer.PublishObservations(_mqttClient, multipleObservations); - } - else + if (first.Category == DataItemCategory.CONDITION) + { + // Conditions have multiple observations + var multipleObservations = new List(groupObservations.Count); + foreach (var observation in groupObservations) { - var observation = dataItemObservations.FirstOrDefault(); - - var x = new Observation(); - x.DeviceUuid = observation.DeviceUuid; - x.DataItemId = observation.DataItemId; - x.DataItem = observation.DataItem; - 
x.Name = observation.Name; - x.Category = observation.Category; - x.Type = observation.Type; - x.SubType = observation.SubType; - x.Representation = observation.Representation; - x.CompositionId = observation.CompositionId; - x.InstanceId = observation.InstanceId; - x.Sequence = observation.Sequence; - x.Timestamp = observation.Timestamp; - x.AddValues(observation.Values); - - await _entityServer.PublishObservation(_mqttClient, x); + multipleObservations.Add(CloneAsObservation(observation)); } + + await _entityServer.PublishObservations(_mqttClient, multipleObservations); + } + else + { + await _entityServer.PublishObservation(_mqttClient, CloneAsObservation(first)); } } } + private static Observation CloneAsObservation(IObservationOutput source) + { + var x = new Observation(); + x.DeviceUuid = source.DeviceUuid; + x.DataItemId = source.DataItemId; + x.DataItem = source.DataItem; + x.Name = source.Name; + x.Category = source.Category; + x.Type = source.Type; + x.SubType = source.SubType; + x.Representation = source.Representation; + x.CompositionId = source.CompositionId; + x.InstanceId = source.InstanceId; + x.Sequence = source.Sequence; + x.Timestamp = source.Timestamp; + x.AddValues(source.Values); + return x; + } + private async Task PublishAssets() { if (_entityServer != null) diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/ObservationGrouper.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/ObservationGrouper.cs new file mode 100644 index 000000000..a09ae7748 --- /dev/null +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/ObservationGrouper.cs @@ -0,0 +1,40 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using System.Collections.Generic; +using System.Linq; +using MTConnect.Observations.Output; + +namespace MTConnect +{ + /// + /// Single-pass grouping helper for MqttRelay observation + /// publishing. 
Module.PublishObservations previously iterated the + /// input enumerable up to three times per distinct DataItemId + /// (Distinct, Where, FirstOrDefault); on large agents that + /// throttled the catch-up after a reconnect. This helper produces + /// the same logical grouping in a single pass. + /// + public static class ObservationGrouper + { + /// + /// Groups by + /// in a single + /// pass. Encounter order is preserved within each group so a + /// caller that relies on sequence-monotonic ordering is not + /// broken. A null source returns an empty result. + /// + public static IEnumerable> GroupByDataItem( + IEnumerable observations) + { + if (observations == null) return Enumerable.Empty>(); + + // Materialise into a List then GroupBy, so a deferred + // upstream query is iterated exactly once. (LINQ GroupBy + // is itself single-pass over its source, but ToList + // makes the contract explicit and lets the caller iterate + // the resulting groups multiple times safely.) + return observations.ToList().GroupBy(o => o.DataItemId); + } + } +} From ce0fd187a1a5a6a162588982ee966ca7af0d4ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:14:07 +0200 Subject: [PATCH 28/29] test(agent-module): pin MqttRelay last-sent-sequence persister policy Module.cs synchronously wrote to disk from every successful observation publish in AgentObservationAdded: File.WriteAllText(path, seq.ToString()); Under high-rate observation arrival under DurableRelay that put a synchronous disk write on every event-handler invocation, throttling the relay. The fix is to track the value in memory and flush only on a timer, on shutdown, and at batch boundaries. Pin the persister policy: * Update marks dirty and round-trips through Read. * Read does not clear dirty: only Flush establishes durable state. * TryFlush emits exactly one writer call when dirty, and clears dirty on success. Returns true if a write happened. 
* TryFlush no-ops when clean (avoids burning IOPS on idle ticks). * TryFlush keeps dirty when the writer throws so the next tick retries. * Initialize seeds the in-memory value from disk without marking dirty (no redundant flush after startup). * Last write wins (not a max-watermark). * Null writer is tolerated. Tests fail to compile until LastSentSequencePersister is introduced in the follow-up fix commit. --- .../LastSentSequencePersisterTests.cs | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequencePersisterTests.cs diff --git a/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequencePersisterTests.cs b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequencePersisterTests.cs new file mode 100644 index 000000000..95b996a0a --- /dev/null +++ b/tests/MTConnect.NET-AgentModule-MqttRelay-Tests/LastSentSequencePersisterTests.cs @@ -0,0 +1,135 @@ +// Copyright (c) 2024 TrakHound Inc., All Rights Reserved. +// TrakHound Inc. licenses this file to you under the MIT license. + +using NUnit.Framework; + +namespace MTConnect.AgentModule.MqttRelay.Tests +{ + /// + /// Pins the MqttRelay last-sent-sequence persistence policy under + /// DurableRelay. Module.cs previously synchronously wrote to disk + /// from every successful observation publish in + /// AgentObservationAdded: + /// + /// File.WriteAllText(path, seq.ToString()); + /// + /// Under high-rate observation arrival that put a synchronous disk + /// write on every event-handler invocation, throttling the relay. + /// LastSentSequencePersister introduces an in-memory tracker plus + /// an explicit dirty bit; the caller flushes to disk on a timer, + /// on shutdown, and after every batch boundary, instead of after + /// every observation. 
+ /// + [TestFixture] + public class LastSentSequencePersisterTests + { + [Test] + public void Update_marks_dirty_and_round_trips_through_read() + { + var persister = new LastSentSequencePersister(); + + persister.Update(42UL); + + Assert.That(persister.Read(), Is.EqualTo(42UL)); + Assert.That(persister.IsDirty, Is.True, + "An in-memory update must mark the persister dirty so the caller knows to flush."); + } + + [Test] + public void Read_does_not_clear_dirty_bit() + { + var persister = new LastSentSequencePersister(); + persister.Update(7UL); + + _ = persister.Read(); + + Assert.That(persister.IsDirty, Is.True, + "Read must not clear dirty: only Flush establishes durable persistence."); + } + + [Test] + public void TryFlush_writes_only_when_dirty_and_clears_dirty_on_success() + { + var persister = new LastSentSequencePersister(); + persister.Update(99UL); + + ulong? written = null; + var flushed = persister.TryFlush(seq => written = seq); + + Assert.That(flushed, Is.True, + "TryFlush returns true when a write was actually emitted."); + Assert.That(written, Is.EqualTo(99UL)); + Assert.That(persister.IsDirty, Is.False, + "After a successful flush the persister must be clean so the next timer tick does not redundantly write."); + } + + [Test] + public void TryFlush_no_ops_when_not_dirty() + { + var persister = new LastSentSequencePersister(); + // Never updated; persister is clean. + var written = false; + var flushed = persister.TryFlush(_ => written = true); + + Assert.That(flushed, Is.False); + Assert.That(written, Is.False, + "A clean persister must not invoke the writer; that would burn IOPS for no progress."); + } + + [Test] + public void TryFlush_keeps_dirty_when_writer_throws() + { + // A failing write must not clear dirty: the caller wants + // the next timer tick to retry. 
+ var persister = new LastSentSequencePersister(); + persister.Update(123UL); + + Assert.Throws( + () => persister.TryFlush(_ => throw new System.IO.IOException("disk full"))); + + Assert.That(persister.IsDirty, Is.True, + "A failed write must leave the persister dirty so the next flush retries."); + Assert.That(persister.Read(), Is.EqualTo(123UL)); + } + + [Test] + public void Update_is_a_last_write_wins_overwrite() + { + var persister = new LastSentSequencePersister(); + persister.Update(100UL); + persister.Update(50UL); + + Assert.That(persister.Read(), Is.EqualTo(50UL), + "Last write wins; the persister is not a max-watermark."); + } + + [Test] + public void Initialize_seeds_value_without_marking_dirty() + { + // Used at startup to load the persisted last-sent-sequence + // from disk: the in-memory value must reflect the on-disk + // value but the persister must be clean (no flush needed + // until a real update arrives). + var persister = new LastSentSequencePersister(); + persister.Initialize(500UL); + + Assert.That(persister.Read(), Is.EqualTo(500UL)); + Assert.That(persister.IsDirty, Is.False, + "Initialize seeds the in-memory value from disk and must not request a redundant flush."); + } + + [Test] + public void TryFlush_no_ops_when_writer_is_null() + { + var persister = new LastSentSequencePersister(); + persister.Update(7UL); + + // A null writer means the caller has not wired persistence + // (e.g. DurableRelay disabled at runtime); the persister + // must not throw. + Assert.DoesNotThrow(() => persister.TryFlush(null)); + // Dirty bit unchanged because no write happened. 
+ Assert.That(persister.IsDirty, Is.True); + } + } +} From a1f722e73df75e1a78cff6681d0876d1bd6c7872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Otto=20Boly=C3=B3s?= Date: Mon, 27 Apr 2026 22:17:06 +0200 Subject: [PATCH 29/29] fix(agent-module): in-memory MqttRelay last-sent-seq with timed flush Module.cs synchronously read and wrote the last-sent-sequence file from the AgentObservationAdded handler on every observation under DurableRelay. On a high-rate stream that put a synchronous disk read plus a synchronous disk write on every ThreadPool callback, serialising the relay behind disk IO and crashing the host's IOPS budget. Wire LastSentSequencePersister into Module: * OnStartAfterLoad seeds the persister from ReadLastSentSequenceFromDisk (the only remaining hot disk read) and starts a one-second flush timer that calls FlushLastSentSequence. * RecordLastSentSequence (replacing SetLastSentSequence) just updates the in-memory persister; no IO on the hot path. * FlushLastSentSequence runs the actual File.WriteAllText under _lastSentSequenceLock and only when the persister is dirty. * RelayBufferedObservations reads in memory, calls RecordLastSentSequence per observation, and flushes once at the batch boundary so a crash before the next timer tick does not lose the batch's progress. * OnStop disposes the timer first, then flushes one last time, so a clean shutdown does not lose pending progress. * AgentObservationAdded no longer issues File.ReadAllText per event; the durable-relay path becomes pure in-memory work. Pinned by LastSentSequencePersisterTests. 
---
 .../LastSentSequencePersister.cs              |  92 ++++++++++++++++
 .../Module.cs                                 | 103 +++++++++++++++---
 2 files changed, 179 insertions(+), 16 deletions(-)
 create mode 100644 agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequencePersister.cs

diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequencePersister.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequencePersister.cs
new file mode 100644
index 000000000..26630b782
--- /dev/null
+++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/LastSentSequencePersister.cs
@@ -0,0 +1,92 @@
+// Copyright (c) 2024 TrakHound Inc., All Rights Reserved.
+// TrakHound Inc. licenses this file to you under the MIT license.
+
+using System;
+using System.Threading;
+
+namespace MTConnect
+{
+    /// <summary>
+    /// Tracks the MqttRelay last-sent observation sequence in memory
+    /// and flushes to durable storage only when explicitly asked.
+    /// Module.cs previously wrote to disk synchronously from every
+    /// successful observation publish; on a high-rate stream that put
+    /// a disk write on every ThreadPool callback. This persister
+    /// decouples the event-handler hot path from disk IO: handlers
+    /// call <see cref="Update(ulong)"/>, and a timer / shutdown / batch
+    /// boundary calls <see cref="TryFlush(Action{ulong})"/>.
+    ///
+    /// All read/write of the value field uses <see cref="Interlocked"/>
+    /// so the persister is safe to share
+    /// between event-handler threads and a flush timer thread on 32-
+    /// bit hosts where bare 64-bit access can tear.
+    /// </summary>
+    public sealed class LastSentSequencePersister
+    {
+        private long _value;
+        private int _dirty; // 0 = clean, 1 = dirty (Interlocked-managed)
+
+        /// <summary>
+        /// Returns the current in-memory sequence value with an
+        /// atomic 64-bit read. Does not clear the dirty bit; only
+        /// <see cref="TryFlush(Action{ulong})"/> establishes durable persistence.
+        /// </summary>
+        public ulong Read()
+        {
+            return unchecked((ulong)Interlocked.Read(ref _value));
+        }
+
+        /// <summary>
+        /// Whether the in-memory value has been modified since the
+        /// last successful flush. 
The flush timer can use this to
+        /// skip writes when the relay is idle.
+        /// </summary>
+        public bool IsDirty
+        {
+            get { return Interlocked.CompareExchange(ref _dirty, 0, 0) != 0; }
+        }
+
+        /// <summary>
+        /// Records a new last-sent sequence value (last write wins)
+        /// and marks the persister dirty so the next flush will
+        /// emit a write.
+        /// </summary>
+        public void Update(ulong value)
+        {
+            Interlocked.Exchange(ref _value, unchecked((long)value));
+            Interlocked.Exchange(ref _dirty, 1);
+        }
+
+        /// <summary>
+        /// Seeds the in-memory value from durable storage at startup
+        /// without marking the persister dirty. The initial value
+        /// already matches disk so a flush would be redundant.
+        /// </summary>
+        public void Initialize(ulong value)
+        {
+            Interlocked.Exchange(ref _value, unchecked((long)value));
+            Interlocked.Exchange(ref _dirty, 0);
+        }
+
+        /// <summary>
+        /// Emits a write through <paramref name="write"/> if and only
+        /// if the persister is dirty, then clears the dirty bit. If
+        /// the writer throws, the dirty bit is preserved and the
+        /// exception propagates so the caller can log and retry on
+        /// the next tick. Returns true when a write was
+        /// actually emitted.
+        /// </summary>
+        public bool TryFlush(Action<ulong> write)
+        {
+            if (write == null) return false;
+            if (Interlocked.CompareExchange(ref _dirty, 0, 0) == 0) return false;
+
+            var snapshot = Read();
+            // Clear dirty *after* a successful write so that a thrown
+            // writer leaves the persister dirty for the next attempt. 
+ write(snapshot); + Interlocked.Exchange(ref _dirty, 0); + return true; + } + } +} diff --git a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs index 8b13ead04..6a4f215a6 100644 --- a/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs +++ b/agent/Modules/MTConnect.NET-AgentModule-MqttRelay/Module.cs @@ -37,9 +37,13 @@ public class Module : MTConnectAgentModule private CancellationTokenSource _stop; private static readonly object _lastSentSequenceLock = new object(); private long _totalIncomingObservations = 0; - // Atomic 64-bit access on 32-bit hosts: a bare long field would - // tear under the durable-relay observation event handlers. - private readonly LastSentSequenceTracker _lastSentSequenceTracker = new LastSentSequenceTracker(); + // In-memory persister: tracks the last-sent sequence under + // DurableRelay and flushes to disk only on a timer, on + // shutdown, and at batch boundaries (rather than on every + // observation, which serialised the relay behind disk IO). + private readonly LastSentSequencePersister _lastSentSequencePersister = new LastSentSequencePersister(); + private System.Threading.Timer _lastSentFlushTimer; + private static readonly TimeSpan LastSentFlushInterval = TimeSpan.FromSeconds(1); private const string DirectoryBuffer = "buffer"; private const string LastSentSequenceFileName = "mqttrelay_last_sent.seq"; @@ -79,6 +83,23 @@ protected override void OnStartAfterLoad(bool initializeDataItems) { _stop = new CancellationTokenSource(); + // Seed the in-memory persister from disk so the first + // RelayBufferedObservations diagnostic uses the durable + // value rather than zero. + if (_configuration.DurableRelay) + { + _lastSentSequencePersister.Initialize(ReadLastSentSequenceFromDisk()); + + // Start a background flush timer. The timer only emits + // a write when the persister is dirty so an idle relay + // does not burn IOPS. 
+ _lastSentFlushTimer = new System.Threading.Timer( + _ => FlushLastSentSequence(), + null, + LastSentFlushInterval, + LastSentFlushInterval); + } + _ = Task.Run(Worker, _stop.Token); } @@ -92,6 +113,16 @@ protected override void OnStop() documentStop: _documentServer != null ? (Action)(() => _documentServer.Stop()) : null, entityStop: null); + // Flush any pending last-sent-sequence update before the + // timer is disposed, so a clean shutdown does not lose + // progress against the buffered-observation relay. + if (_lastSentFlushTimer != null) + { + try { _lastSentFlushTimer.Dispose(); } catch { } + _lastSentFlushTimer = null; + } + FlushLastSentSequence(); + if (_stop != null) _stop.Cancel(); // Bounded await on the disconnect: previously this was a @@ -294,7 +325,10 @@ private async Task RelayBufferedObservations() { if (Agent is IMTConnectAgentBroker broker) { - ulong lastSent = GetLastSentSequence(); + // Read in-memory: the persister was seeded from disk at + // OnStartAfterLoad. Avoid a synchronous file read on + // every relay attempt. + ulong lastSent = _lastSentSequencePersister.Read(); ulong from = Math.Max(lastSent + 1, broker.FirstSequence); ulong to = broker.LastSequence; @@ -340,12 +374,18 @@ private async Task RelayBufferedObservations() var result = await _entityServer.PublishObservation(_mqttClient, x); if (result != null && result.IsSuccess) { - SetLastSentSequence(x.Sequence); + RecordLastSentSequence(x.Sequence); } } + // Batch boundary: the buffered relay just ran a + // chunk of observations under DurableRelay; flush + // the persister so a crash before the next timer + // tick does not lose this batch's progress. + FlushLastSentSequence(); + Log(MTConnectLogLevel.Information, $"[MQTT Relay] Buffered observation relay complete. 
{observations.Count} missed observations sent."); - var lastSentSnapshot = _lastSentSequenceTracker.Read(); + var lastSentSnapshot = _lastSentSequencePersister.Read(); var unsent = broker.LastSequence > lastSentSnapshot ? broker.LastSequence - lastSentSnapshot : 0; if (unsent > 0) Log(MTConnectLogLevel.Information, $"[MQTT Relay] {unsent} new observations arrived during relay and will be sent next."); @@ -371,7 +411,10 @@ private static string GetLastSentSequenceFilePath(IAgentApplicationConfiguration return Path.Combine(baseDir, LastSentSequenceFileName); } - private ulong GetLastSentSequence() + // Disk read used at startup (Initialize) only. Hot-path reads + // go through _lastSentSequencePersister.Read(); this method + // is the single point of entry for the actual file IO. + private ulong ReadLastSentSequenceFromDisk() { if (!_configuration.DurableRelay) return 0; var agentConfig = Agent?.Configuration as IAgentApplicationConfiguration; @@ -387,14 +430,39 @@ private ulong GetLastSentSequence() return 0; } - private void SetLastSentSequence(ulong seq) + // Records a new in-memory last-sent sequence. The actual disk + // write happens out-of-band on the flush timer / shutdown / + // batch boundary via FlushLastSentSequence. + private void RecordLastSentSequence(ulong seq) { if (!_configuration.DurableRelay) return; // Default - var agentConfig = Agent?.Configuration as IAgentApplicationConfiguration; - var path = GetLastSentSequenceFilePath(agentConfig); - lock (_lastSentSequenceLock) + _lastSentSequencePersister.Update(seq); + } + + // Writes the in-memory persister value to disk if dirty. Safe + // to call from any thread; the inner File.WriteAllText runs + // under _lastSentSequenceLock so a timer tick cannot race a + // shutdown flush. 
+ private void FlushLastSentSequence() + { + if (!_configuration.DurableRelay) return; + + try + { + var agentConfig = Agent?.Configuration as IAgentApplicationConfiguration; + var path = GetLastSentSequenceFilePath(agentConfig); + + _lastSentSequencePersister.TryFlush(seq => + { + lock (_lastSentSequenceLock) + { + File.WriteAllText(path, seq.ToString()); + } + }); + } + catch (Exception ex) { - File.WriteAllText(path, seq.ToString()); + Log(MTConnectLogLevel.Warning, $"MQTT Relay last-sent-sequence flush error : {ex.Message}"); } } @@ -754,8 +822,11 @@ await AsyncVoidGuard.Run( { Interlocked.Increment(ref _totalIncomingObservations); - var lastSent = GetLastSentSequence(); - _lastSentSequenceTracker.Write(lastSent); + // No disk read here: the persister was seeded + // at startup and is updated in-memory below. + // The previous GetLastSentSequence() call did + // a synchronous File.ReadAllText on every + // observation event. if (_entityServer != null) { @@ -765,7 +836,7 @@ await AsyncVoidGuard.Run( var result = await _entityServer.PublishObservations(_mqttClient, conditionObservations.OfType()); if (result != null && result.IsSuccess && conditionObservations != null && conditionObservations.Any()) { - SetLastSentSequence(conditionObservations.Max(o => o.Sequence)); + RecordLastSentSequence(conditionObservations.Max(o => o.Sequence)); } } else @@ -773,7 +844,7 @@ await AsyncVoidGuard.Run( var result = await _entityServer.PublishObservation(_mqttClient, observation); if (result != null && result.IsSuccess) { - SetLastSentSequence(observation.Sequence); + RecordLastSentSequence(observation.Sequence); } } }