diff --git a/platformio.ini b/platformio.ini index c390c318fc..27b33351d6 100644 --- a/platformio.ini +++ b/platformio.ini @@ -166,3 +166,17 @@ build_src_filter = +<../src/Utils.cpp> lib_deps = google/googletest @ 1.17.0 + +; Standalone unit tests for reliability changes (issues #1775, #1489). +; These tests have no Mesh-stack dependencies and compile without the +; Arduino framework or crypto libraries. +[env:native_reliability] +platform = native +build_flags = -std=c++17 +test_filter = + test_path_gating + test_flood_ack +build_src_filter = + -<*> +lib_deps = + google/googletest @ 1.17.0 diff --git a/src/Mesh.cpp b/src/Mesh.cpp index 57fee14036..59f68ac151 100644 --- a/src/Mesh.cpp +++ b/src/Mesh.cpp @@ -457,11 +457,16 @@ Packet* Mesh::createPathReturn(const uint8_t* dest_hash, const uint8_t* secret, if (extra_len > 0) { data[data_len++] = extra_type; memcpy(&data[data_len], extra, extra_len); data_len += extra_len; - } else { - // append a timestamp, or random blob (to make packet_hash unique) - data[data_len++] = 0xFF; // dummy payload type - getRNG()->random(&data[data_len], 4); data_len += 4; } + // Always append a random nonce regardless of extra presence. + // AES-ECB is deterministic: without a nonce, identical plaintext produces + // identical ciphertext → identical calculatePacketHash() → hasSeen() treats + // every retransmission of the same PATH+ACK as a duplicate and drops it. + // The nonce is ignored by the receiver (it reads only the typed extra fields). + // This mirrors the original no-extra handling above (issue: nonce was missing + // for the extra_len > 0 branch, breaking flood-ACK retry reliability). + data[data_len++] = 0xFF; // dummy payload type (ignored by receiver) + getRNG()->random(&data[data_len], 4); data_len += 4; len += Utils::encryptThenMAC(secret, &packet->payload[len], data, data_len); } diff --git a/src/helpers/BaseChatMesh.cpp b/src/helpers/BaseChatMesh.cpp index 7ddc461d29..a3298ea30b 100644 --- a/src/helpers/BaseChatMesh.cpp +++ b/src/helpers/BaseChatMesh.cpp @@ -9,6 +9,24 @@ #define TXT_ACK_DELAY 200 #endif +// Path-stickiness window (seconds): once a path is stored, ANY replacement is +// blocked for this long. No hop-count comparison is performed — fewer hops is +// NOT automatically better because rxdelay-based SNR ordering means high-SNR +// long-hop paths arrive first, then low-SNR short-hop paths follow. A blanket +// first-arrived lock avoids that bias entirely. +// 10 s is long enough to suppress multipath duplicates (50-200 ms apart) yet +// short enough that a genuinely changed topology gets a fresh path quickly. +// Addresses issue #1775. +#ifndef PATH_STICKINESS_WINDOW_SECS + #define PATH_STICKINESS_WINDOW_SECS 10u +#endif + +// Number of independent flood-ACK transmissions at increasing delays. +// Addresses issue #1489 (single flood ACK lost in noisy RF environments). +#ifndef FLOOD_ACK_RETRY_COUNT + #define FLOOD_ACK_RETRY_COUNT 3 +#endif + void BaseChatMesh::sendFloodScoped(const ContactInfo& recipient, mesh::Packet* pkt, uint32_t delay_millis) { sendFlood(pkt, delay_millis); } @@ -40,8 +58,14 @@ mesh::Packet* BaseChatMesh::createSelfAdvert(const char* name, double lat, doubl void BaseChatMesh::sendAckTo(const ContactInfo& dest, uint32_t ack_hash) { if (dest.out_path_len == OUT_PATH_UNKNOWN) { - mesh::Packet* ack = createAck(ack_hash); - if (ack) sendFloodScoped(dest, ack, TXT_ACK_DELAY); + // Send flood ACK FLOOD_ACK_RETRY_COUNT times at increasing delays (issue #1489). + // Each transmission is an independent RF attempt; duplicates are discarded by + // hasSeen() at every receiving node so only the first successful copy is processed. + static const uint32_t flood_ack_delays[FLOOD_ACK_RETRY_COUNT] = { TXT_ACK_DELAY, 800, 2000 }; + for (int i = 0; i < FLOOD_ACK_RETRY_COUNT; i++) { + mesh::Packet* ack = createAck(ack_hash); + if (ack) sendFloodScoped(dest, ack, flood_ack_delays[i]); + } } else { uint32_t d = TXT_ACK_DELAY; if (getExtraAckTransmitCount() > 0) { @@ -226,6 +250,12 @@ void BaseChatMesh::onPeerDataRecv(mesh::Packet* packet, uint8_t type, int sender mesh::Packet* path = createPathReturn(from.id, secret, packet->path, packet->path_len, PAYLOAD_TYPE_ACK, (uint8_t *) &ack_hash, 4); if (path) sendFloodScoped(from, path, TXT_ACK_DELAY); + // Also send a standalone flood ACK as backup: the PATH+ACK above is a single packet + // whose ciphertext (AES-ECB) was deterministic before the nonce fix. Even with the + // nonce, RF loss can drop the PATH+ACK entirely. The standalone ACK travels + // independently and ensures the sender cancels its retry timeout even if path + // establishment fails (issue #1489). + sendAckTo(from, ack_hash); } else { sendAckTo(from, ack_hash); } @@ -253,6 +283,8 @@ void BaseChatMesh::onPeerDataRecv(mesh::Packet* packet, uint8_t type, int sender mesh::Packet* path = createPathReturn(from.id, secret, packet->path, packet->path_len, PAYLOAD_TYPE_ACK, (uint8_t *) &ack_hash, 4); if (path) sendFloodScoped(from, path, TXT_ACK_DELAY); + // Standalone flood ACK backup — same rationale as TXT_TYPE_PLAIN above + sendAckTo(from, ack_hash); } else { sendAckTo(from, ack_hash); } @@ -302,10 +334,37 @@ bool BaseChatMesh::onPeerPathRecv(mesh::Packet* packet, int sender_idx, const ui } bool BaseChatMesh::onContactPathRecv(ContactInfo& from, uint8_t* in_path, uint8_t in_path_len, uint8_t* out_path, uint8_t out_path_len, uint8_t extra_type, uint8_t* extra, uint8_t extra_len) { - // NOTE: default impl, we just replace the current 'out_path' regardless, whenever sender sends us a new out_path. - // FUTURE: could store multiple out_paths per contact, and try to find which is the 'best'(?) + uint32_t now = getRTCClock()->getCurrentTime(); + + // Path-stickiness lock (issue #1775): block ANY replacement of a freshly + // stored path for PATH_STICKINESS_WINDOW_SECS seconds. No hop-count + // comparison — fewer hops is not automatically better because rxdelay-based + // SNR ordering propagates high-SNR (often longer-hop) paths first; comparing + // hops would therefore bias toward the wrong path. A blanket first-arrived + // lock is simpler and avoids that bias. + // The embedded ACK/response is always processed regardless of the lock. + if (from.out_path_len != OUT_PATH_UNKNOWN && from.out_path_timestamp != 0) { + uint32_t age_secs = now - from.out_path_timestamp; + if (age_secs < PATH_STICKINESS_WINDOW_SECS) { + // Path is still within the lock window — keep it, but still process any + // embedded ACK or response so the sender's retry timer is cancelled. + onContactPathUpdated(from); + if (extra_type == PAYLOAD_TYPE_ACK && extra_len >= 4) { + if (processAck(extra) != NULL) { + txt_send_timeout = 0; + } + } else if (extra_type == PAYLOAD_TYPE_RESPONSE && extra_len > 0) { + onContactResponse(from, extra, extra_len); + } + return true; + } + } + + // Accept the new path. from.out_path_len = mesh::Packet::copyPath(from.out_path, out_path, out_path_len); // store a copy of path, for sendDirect() - from.lastmod = getRTCClock()->getCurrentTime(); + from.out_path_timestamp = now; // record when this path was accepted + from.path_ack_count = 0; // reset delivery counter for the new path + from.lastmod = now; onContactPathUpdated(from); @@ -326,6 +385,13 @@ void BaseChatMesh::onAckRecv(mesh::Packet* packet, uint32_t ack_crc) { txt_send_timeout = 0; // matched one we're waiting for, cancel timeout timer packet->markDoNotRetransmit(); // ACK was for this node, so don't retransmit + // Track per-contact direct-path delivery success (issue #1775). + // Incremented here so the path-stickiness logic can eventually factor in + // proven delivery when making replacement decisions. + if (!packet->isRouteFlood() && from->out_path_len != OUT_PATH_UNKNOWN) { + if (from->path_ack_count < 255) from->path_ack_count++; + } + if (packet->isRouteFlood() && from->out_path_len != OUT_PATH_UNKNOWN) { // we have direct path, but other node is still sending flood, so maybe they didn't receive reciprocal path properly(?) handleReturnPathRetry(*from, packet->path, packet->path_len); diff --git a/src/helpers/ContactInfo.h b/src/helpers/ContactInfo.h index ede977cace..b672850079 100644 --- a/src/helpers/ContactInfo.h +++ b/src/helpers/ContactInfo.h @@ -17,6 +17,8 @@ struct ContactInfo { uint32_t lastmod; // by OUR clock int32_t gps_lat, gps_lon; // 6 dec places uint32_t sync_since; + uint32_t out_path_timestamp; // RTC time (secs) when out_path was last accepted; 0 = never set + uint8_t path_ack_count; // saturating count of direct-path ACKs received (delivery tracking) const uint8_t* getSharedSecret(const mesh::LocalIdentity& self_id) const { if (!shared_secret_valid) { diff --git a/test/test_flood_ack/test_flood_ack.cpp b/test/test_flood_ack/test_flood_ack.cpp new file mode 100644 index 0000000000..4dfcb08d86 --- /dev/null +++ b/test/test_flood_ack/test_flood_ack.cpp @@ -0,0 +1,212 @@ +/** + * test_flood_ack.cpp + * + * Unit tests for the flood-ACK retry scheduling introduced in + * BaseChatMesh::sendAckTo (fixes issue #1489). + * + * When a contact has no known direct path (out_path_len == OUT_PATH_UNKNOWN) + * the implementation must schedule exactly FLOOD_ACK_RETRY_COUNT independent + * flood transmissions at the delays [200ms, 800ms, 2000ms]. When a direct + * path is known the pre-existing single-direct-ACK behaviour is unchanged. + * + * Deduplication on the receiver side is handled by the pre-existing + * MeshTables::hasSeen() mechanism; these tests confirm only the sender-side + * scheduling contract. + * + * See docs/reliability_changes.md for the full rationale. + */ + +#include +#include +#include + +// ---- Constants mirrored from BaseChatMesh.cpp -------------------------------- +#define OUT_PATH_UNKNOWN 0xFF +#define TXT_ACK_DELAY 200u +#define FLOOD_ACK_RETRY_COUNT 3 + +// Expected delay schedule — must match the static array in sendAckTo. +static const uint32_t EXPECTED_FLOOD_DELAYS[FLOOD_ACK_RETRY_COUNT] = { + TXT_ACK_DELAY, // 200 ms + 800u, + 2000u, +}; + +// ---- Minimal send recorder --------------------------------------------------- + +struct ScheduledPacket { + bool is_flood; + uint32_t delay_ms; +}; + +static ScheduledPacket sched_buf[16]; +static int sched_count; + +static void reset_sched() { + sched_count = 0; + memset(sched_buf, 0, sizeof(sched_buf)); +} + +static void mock_sendFloodScoped(uint32_t delay) { + if (sched_count < 16) sched_buf[sched_count++] = { true, delay }; +} + +static void mock_sendDirect(uint32_t delay) { + if (sched_count < 16) sched_buf[sched_count++] = { false, delay }; +} + +// ---- sendAckTo logic transcription ------------------------------------------ +// This is a direct copy of the changed sendAckTo body (excluding the Mesh +// packet-creation calls that are irrelevant to scheduling counts and delays). +// If the implementation diverges from this transcription the test author must +// update both, which serves as an explicit change-review gate. +static void testSendAckTo(uint8_t out_path_len, uint32_t /*ack_hash*/) { + if (out_path_len == OUT_PATH_UNKNOWN) { + static const uint32_t flood_ack_delays[FLOOD_ACK_RETRY_COUNT] = { + TXT_ACK_DELAY, 800, 2000 + }; + for (int i = 0; i < FLOOD_ACK_RETRY_COUNT; i++) { + mock_sendFloodScoped(flood_ack_delays[i]); + } + } else { + // Direct path: single ACK at TXT_ACK_DELAY (getExtraAckTransmitCount()==0 default) + mock_sendDirect(TXT_ACK_DELAY); + } +} + +// Flood path (out_path_len == OUT_PATH_UNKNOWN) + +TEST(FloodAck, UnknownPath_SendsExactlyThreeTimes) { + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xDEADBEEF); + EXPECT_EQ(FLOOD_ACK_RETRY_COUNT, sched_count); +} + +TEST(FloodAck, UnknownPath_AllSendsAreFlood) { + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xDEADBEEF); + for (int i = 0; i < sched_count; i++) { + EXPECT_TRUE(sched_buf[i].is_flood) + << "Packet " << i << " should be a flood send"; + } +} + +TEST(FloodAck, UnknownPath_DelaysMatch200_800_2000ms) { + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xDEADBEEF); + ASSERT_EQ(FLOOD_ACK_RETRY_COUNT, sched_count); + for (int i = 0; i < FLOOD_ACK_RETRY_COUNT; i++) { + EXPECT_EQ(EXPECTED_FLOOD_DELAYS[i], sched_buf[i].delay_ms) + << "Delay " << i << " should be " << EXPECTED_FLOOD_DELAYS[i] << " ms"; + } +} + +TEST(FloodAck, UnknownPath_DelaysAreStrictlyIncreasing) { + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xDEADBEEF); + for (int i = 1; i < sched_count; i++) { + EXPECT_GT(sched_buf[i].delay_ms, sched_buf[i - 1].delay_ms) + << "Delay " << i << " must be > delay " << (i - 1) + << " to ensure temporally spread retries"; + } +} + +TEST(FloodAck, UnknownPath_FirstDelayIsAckDelay) { + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xDEADBEEF); + ASSERT_GE(sched_count, 1); + EXPECT_EQ(TXT_ACK_DELAY, sched_buf[0].delay_ms); +} + +// Direct path (out_path_len != OUT_PATH_UNKNOWN) — pre-existing behaviour + +TEST(FloodAck, KnownPath_SendsExactlyOnce) { + reset_sched(); + testSendAckTo(/*out_path_len=*/1, 0xDEADBEEF); + EXPECT_EQ(1, sched_count); +} + +TEST(FloodAck, KnownPath_SendIsDirect) { + reset_sched(); + testSendAckTo(/*out_path_len=*/1, 0xDEADBEEF); + ASSERT_EQ(1, sched_count); + EXPECT_FALSE(sched_buf[0].is_flood); +} + +TEST(FloodAck, KnownPath_DelayIsAckDelay) { + reset_sched(); + testSendAckTo(/*out_path_len=*/2, 0xDEADBEEF); + ASSERT_EQ(1, sched_count); + EXPECT_EQ(TXT_ACK_DELAY, sched_buf[0].delay_ms); +} + +TEST(FloodAck, KnownPath_MultiHop_SendsOnce) { + reset_sched(); + testSendAckTo(/*out_path_len=*/5, 0x12345678); + EXPECT_EQ(1, sched_count); +} + +// Retry count configuration contract + +TEST(FloodAck, RetryCountIs3) { + // Explicitly documents the expected constant value. + EXPECT_EQ(3, FLOOD_ACK_RETRY_COUNT); +} + +TEST(FloodAck, ExpectedDelaysTableMatchesConstants) { + EXPECT_EQ(TXT_ACK_DELAY, EXPECTED_FLOOD_DELAYS[0]); + EXPECT_EQ(800u, EXPECTED_FLOOD_DELAYS[1]); + EXPECT_EQ(2000u, EXPECTED_FLOOD_DELAYS[2]); +} + +// ============================================================================= +// Backup ACK for flood-received messages +// +// When a flood message is received, the responder sends a PATH+ACK (one packet +// that serves both path-establishment and ACK delivery). If the PATH+ACK is +// lost, the sender never gets the ACK. The fix: also call sendAckTo() to send +// independent flood ACKs as a backup. This suite verifies the backup contract +// using the same sendAckTo transcription, called from the "no known path" state +// that applies immediately after receiving the very first flood message. +// ============================================================================= + +TEST(BackupFloodAck, FirstMessageScenario_SendsFloodAckBackup) { + // When the receiver has no stored path to the sender (first-ever message), + // sendAckTo() is invoked with OUT_PATH_UNKNOWN and must produce flood ACKs. + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0xCAFEBABE); + EXPECT_EQ(FLOOD_ACK_RETRY_COUNT, sched_count); + for (int i = 0; i < sched_count; i++) { + EXPECT_TRUE(sched_buf[i].is_flood) + << "Backup ACK " << i << " must be flood (no path known yet)"; + } +} + +TEST(BackupFloodAck, BackupAndPathAck_AreIndependent) { + // The backup standalone ACK (sendAckTo) and the PATH+ACK are independent + // packets. This test verifies sendAckTo is not affected by the PATH+ACK + // scheduling — it produces its own complete set of FLOOD_ACK_RETRY_COUNT + // flood transmissions at the standard delays. + reset_sched(); + testSendAckTo(OUT_PATH_UNKNOWN, 0x11223344); + ASSERT_EQ(FLOOD_ACK_RETRY_COUNT, sched_count); + EXPECT_EQ(TXT_ACK_DELAY, sched_buf[0].delay_ms); + EXPECT_EQ(800u, sched_buf[1].delay_ms); + EXPECT_EQ(2000u, sched_buf[2].delay_ms); +} + +TEST(BackupFloodAck, KnownReturnPath_SendsDirectAckBackup) { + // If the receiver already has a stored direct path back to the sender + // (e.g., established in a prior exchange), the backup sendAckTo() sends a + // direct ACK rather than a flood — still providing a reliable backup channel. + reset_sched(); + testSendAckTo(/*out_path_len=*/1, 0xCAFEBABE); + EXPECT_EQ(1, sched_count); + EXPECT_FALSE(sched_buf[0].is_flood); +} + + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/test/test_path_gating/test_path_gating.cpp b/test/test_path_gating/test_path_gating.cpp new file mode 100644 index 0000000000..4059e2dd51 --- /dev/null +++ b/test/test_path_gating/test_path_gating.cpp @@ -0,0 +1,136 @@ +/** + * test_path_gating.cpp + * + * Unit tests for the path-stickiness lock introduced in + * BaseChatMesh::onContactPathRecv (fixes issue #1775). + * + * Strategy: blanket first-arrived lock for PATH_STICKINESS_WINDOW_SECS (10 s). + * ANY replacement of a freshly stored path is blocked during the window, + * regardless of hop count. Fewer hops is NOT automatically better: + * rxdelay-based SNR ordering propagates high-SNR long-hop paths first, so a + * hop-count preference would bias toward the wrong path. After the window + * expires any new path is accepted unconditionally. + * + * See docs/reliability_changes.md for the full rationale. + */ + +#include +#include + +// ---- Constants mirrored from BaseChatMesh.cpp -------------------------------- +#define OUT_PATH_UNKNOWN 0xFF +#define PATH_STICKINESS_WINDOW_SECS 10u // 10 seconds blanket first-arrived lock + +// ---- Path-lock decision extracted as a pure function ----------------------- +// Must remain an exact transcription of the condition in onContactPathRecv. +static bool shouldKeepStoredPath( + uint32_t now, + uint8_t stored_path_len, + uint32_t stored_path_timestamp) +{ + if (stored_path_len == OUT_PATH_UNKNOWN) return false; + if (stored_path_timestamp == 0) return false; + uint32_t age = now - stored_path_timestamp; + return age < PATH_STICKINESS_WINDOW_SECS; // blanket lock — no hop comparison +} + +// ============================================================================= +// No stored path — lock must never block +// ============================================================================= + +TEST(PathGating, NoStoredPath_AlwaysAcceptsIncoming) { + EXPECT_FALSE(shouldKeepStoredPath(1000, OUT_PATH_UNKNOWN, 0)); + EXPECT_FALSE(shouldKeepStoredPath(1000, OUT_PATH_UNKNOWN, 999)); +} + +// ============================================================================= +// Stale path (age >= window) — lock must not block +// ============================================================================= + +TEST(PathGating, StaleStoredPath_ExactlyAtWindowBoundary_Accepts) { + uint32_t now = 1000; + uint32_t ts = now - PATH_STICKINESS_WINDOW_SECS; // age == window → expired + EXPECT_FALSE(shouldKeepStoredPath(now, 1, ts)); +} + +TEST(PathGating, StaleStoredPath_JustOutsideWindow_Accepts) { + uint32_t now = 1000; + uint32_t ts = now - (PATH_STICKINESS_WINDOW_SECS + 1); + EXPECT_FALSE(shouldKeepStoredPath(now, 1, ts)); +} + +TEST(PathGating, PathExpires_After10Seconds_Accepts) { + uint32_t now = 500; + uint32_t ts = now - PATH_STICKINESS_WINDOW_SECS; + EXPECT_FALSE(shouldKeepStoredPath(now, 2, ts)); +} + +TEST(PathGating, VeryOldStoredPath_Accepts) { + EXPECT_FALSE(shouldKeepStoredPath(10000, 1, 1u)); +} + +// ============================================================================= +// Fresh path (age < window) — ANY replacement is blocked (blanket lock) +// ============================================================================= + +TEST(PathGating, FreshPath_BlocksReplacement_1SecondOld) { + uint32_t now = 1000; + uint32_t ts = now - 1; + EXPECT_TRUE(shouldKeepStoredPath(now, 1, ts)); +} + +TEST(PathGating, FreshPath_BlocksReplacement_5SecondsOld) { + uint32_t now = 1000; + uint32_t ts = now - 5; + EXPECT_TRUE(shouldKeepStoredPath(now, 1, ts)); +} + +TEST(PathGating, FreshPath_BlocksReplacement_JustInsideWindow) { + uint32_t now = 1000; + uint32_t ts = now - (PATH_STICKINESS_WINDOW_SECS - 1); // 1 s before expiry + EXPECT_TRUE(shouldKeepStoredPath(now, 1, ts)); +} + +// Key property: blanket lock applies regardless of the incoming hop count. +// These cases verify no hop comparison is performed. + +TEST(PathGating, FreshPath_BlocksReplacement_LongerHopIncoming) { + // stored=1 hop, new=3 hops — old logic would also block; new logic still blocks + uint32_t now = 1000, ts = now - 1; + EXPECT_TRUE(shouldKeepStoredPath(now, 1, ts)); +} + +TEST(PathGating, FreshPath_BlocksReplacement_ShorterHopIncoming) { + // stored=3 hops, new=1 hop — old logic would accept (fewer hops); new logic blocks + uint32_t now = 1000, ts = now - 1; + EXPECT_TRUE(shouldKeepStoredPath(now, 3, ts)); +} + +TEST(PathGating, FreshPath_BlocksReplacement_SameHopIncoming) { + // stored=2 hops, new=2 hops — both old and new logic block + uint32_t now = 1000, ts = now - 1; + EXPECT_TRUE(shouldKeepStoredPath(now, 2, ts)); +} + +// ============================================================================= +// Zero timestamp — treated as "never set", must never lock +// ============================================================================= + +TEST(PathGating, ZeroTimestamp_AlwaysAcceptsNew) { + EXPECT_FALSE(shouldKeepStoredPath(1000, 2, 0)); + EXPECT_FALSE(shouldKeepStoredPath(1000, 1, 0)); +} + +// ============================================================================= +// Window constant sanity +// ============================================================================= + +TEST(PathGating, WindowIs10Seconds) { + EXPECT_EQ(10u, PATH_STICKINESS_WINDOW_SECS); +} + + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}