diff --git a/src/foundation/str_util.c b/src/foundation/str_util.c index 6275ab59..9b896f42 100644 --- a/src/foundation/str_util.c +++ b/src/foundation/str_util.c @@ -282,10 +282,10 @@ bool cbm_validate_project_name(const char *name) { /* Reject leading dot (hidden files / relative refs) */ if (name[0] == '.') return false; - /* Allow only alphanumeric, dash, underscore, dot */ + /* Allow only alphanumeric, dash, underscore, dot, and percent (percent-encoded bytes in derived names) */ for (const char *p = name; *p; p++) { if (!(((*p >= 'a') && (*p <= 'z')) || ((*p >= 'A') && (*p <= 'Z')) || - ((*p >= '0') && (*p <= '9')) || *p == '-' || *p == '_' || *p == '.')) { + ((*p >= '0') && (*p <= '9')) || *p == '-' || *p == '_' || *p == '.' || *p == '%')) { return false; } } diff --git a/src/pipeline/fqn.c b/src/pipeline/fqn.c index 0da3e737..ef600614 100644 --- a/src/pipeline/fqn.c +++ b/src/pipeline/fqn.c @@ -11,6 +11,7 @@ #include #include // NULL #include +#include <stdint.h> // SIZE_MAX, uint32_t #include #include // strdup @@ -18,6 +19,15 @@ #define FQN_MAX_PATH_SEGS 254 #define FQN_MAX_DIR_SEGS 255 +/* Cap the derived project name well under the common 255-byte filename-component + * limit (the name is later used as a path component, "<dir>/<name>.db"). + * Percent-encoding non-ASCII bytes can triple the byte length, so a deep CJK path + * could otherwise exceed the limit and make the .db file un-openable + * (ENAMETOOLONG). When the slug is too long we cut it on a whole-escape boundary + * and append a short hash of the full slug, so long paths that share a prefix still get distinct names (up to 32-bit hash collisions). */ +#define PROJECT_NAME_MAX_LEN 200 +#define PROJECT_NAME_HASH_HEX 8 + /* ── Internal helpers ─────────────────────────────────────────────── */ /* Build a dot-joined string from segments. Returns heap-allocated string.
*/ @@ -324,28 +334,46 @@ char *cbm_project_name_from_path(const char *abs_path) { return strdup("root"); } - /* Work on mutable copy */ - char *path = strdup(abs_path); - size_t len = strlen(path); + char *source = strdup(abs_path); + if (!source) { + return NULL; + } + cbm_normalize_path_sep(source); - /* Normalize path separators */ - cbm_normalize_path_sep(path); + size_t len = strlen(source); + if (len > (SIZE_MAX - SKIP_ONE) / 3) { + free(source); + return NULL; + } - /* Map every character cbm_validate_project_name would reject to '-'. The - * validator (used by resolve_store via project_db_path) allows only - * [A-Za-z0-9._-], so anything else — path separators, ':', spaces, '@', - * '+', unicode bytes, … — must be normalized here. Otherwise a repo like - * "/home/u/my project" yields the name "home-u-my project": indexing - * creates the DB and it shows in list_projects, but resolve_store rejects - * the space and reports project-not-found (#349). */ + /* Map path bytes to a cbm_validate_project_name-safe slug. ASCII behavior + * stays as before: only [A-Za-z0-9._-] is copied, while separators, ':', + * spaces, '@', '+', literal '%', ... become '-'. Non-ASCII UTF-8 bytes are + * percent-encoded so distinct path segments keep identifying data. */ + char *path = malloc((len * 3) + SKIP_ONE); + if (!path) { + free(source); + return NULL; + } + static const char hex[] = "0123456789ABCDEF"; + char *out = path; for (size_t i = 0; i < len; i++) { - unsigned char c = (unsigned char)path[i]; + unsigned char c = (unsigned char)source[i]; bool safe = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.'
|| c == '_' || c == '-'; - if (!safe) { - path[i] = '-'; + if (safe) { + *out++ = (char)c; + } else if (c >= 0x80) { + *out++ = '%'; + *out++ = hex[c >> 4]; + *out++ = hex[c & 0x0F]; + } else { + *out++ = '-'; } } + *out = '\0'; + free(source); + len = (size_t)(out - path); /* Collapse consecutive dashes, and consecutive dots (the validator also * rejects any ".." sequence). */ @@ -377,6 +405,41 @@ char *cbm_project_name_from_path(const char *abs_path) { return strdup("root"); } + /* Bound the slug so "<dir>/<name>.db" stays within the OS filename + * limit. On truncation, append "-<8 hex>" (FNV-1a of the full slug) so two + * long paths that share a prefix still produce distinct names. */ + size_t cur = strlen(start); + if (cur > PROJECT_NAME_MAX_LEN) { + uint32_t h = 2166136261u; /* FNV-1a offset basis */ + for (size_t i = 0; i < cur; i++) { + h ^= (unsigned char)start[i]; + h *= 16777619u; /* FNV-1a prime */ + } + size_t keep = PROJECT_NAME_MAX_LEN - (PROJECT_NAME_HASH_HEX + SKIP_ONE); + /* Never cut through a "%XX" escape: a literal '%' in the path was mapped + * to '-', so every '%' in the slug starts a three-character escape. Back + * up to the '%' when the cut lands one or two bytes past it. */ + if (keep > 0 && start[keep - SKIP_ONE] == '%') { + keep -= SKIP_ONE; + } else if (keep > SKIP_ONE && start[keep - 2] == '%') { + keep -= 2; + } + /* Don't end the kept prefix on a '-' (would collide with the separator + * and could leave a dangling dash before the hash).
*/ + while (keep > 0 && start[keep - SKIP_ONE] == '-') { + keep--; + } + char *result = malloc(keep + SKIP_ONE + PROJECT_NAME_HASH_HEX + SKIP_ONE); + if (!result) { + free(path); + return NULL; + } + memcpy(result, start, keep); + snprintf(result + keep, SKIP_ONE + PROJECT_NAME_HASH_HEX + SKIP_ONE, "-%08lx", (unsigned long)h); /* cast: uint32_t need not be unsigned int */ + free(path); + return result; + } + char *result = strdup(start); free(path); return result; diff --git a/tests/test_fqn.c b/tests/test_fqn.c index 9ff999e6..35fc79e9 100644 --- a/tests/test_fqn.c +++ b/tests/test_fqn.c @@ -454,6 +454,95 @@ TEST(project_name_consecutive_colons) { PASS(); } +TEST(project_name_percent_encodes_cjk) { + char *name = cbm_project_name_from_path( + "/Users/yunxin/Desktop/\xE5\xBC\x80\xE5\x8F\x91/\xE5\x90\x8E\xE7\xAB\xAF"); + ASSERT_NOT_NULL(name); + ASSERT_TRUE(name[0] != '\0'); + ASSERT_TRUE(strncmp(name, "Users-yunxin-Desktop-", strlen("Users-yunxin-Desktop-")) == 0); + ASSERT_TRUE(strstr(name, "%E5%BC%80%E5%8F%91") != NULL); + ASSERT_TRUE(strstr(name, "%E5%90%8E%E7%AB%AF") != NULL); + ASSERT_TRUE(cbm_validate_project_name(name)); + free(name); + PASS(); +} + +TEST(project_name_distinct_cjk_dirs) { + char *dev = cbm_project_name_from_path("/p/\xE5\xBC\x80\xE5\x8F\x91"); + char *backend = cbm_project_name_from_path("/p/\xE5\x90\x8E\xE7\xAB\xAF"); + ASSERT_NOT_NULL(dev); + ASSERT_NOT_NULL(backend); + ASSERT_TRUE(strcmp(dev, backend) != 0); + free(dev); + free(backend); + PASS(); +} + +TEST(project_name_ascii_regression_space) { + ASSERT_FQN(cbm_project_name_from_path("/home/u/my project"), "home-u-my-project"); + PASS(); +} + +TEST(project_name_validator_percent_and_rejections) { + ASSERT_TRUE(cbm_validate_project_name("Users-dev-%E5%BC%80%E5%8F%91")); + ASSERT_TRUE(!cbm_validate_project_name("a/b")); + ASSERT_TRUE(!cbm_validate_project_name("a\\b")); + ASSERT_TRUE(!cbm_validate_project_name("a..b")); + ASSERT_TRUE(!cbm_validate_project_name(".hidden")); + PASS(); +} + +/* A deep CJK path percent-encodes to many bytes; the slug
must stay bounded + * (under the OS filename limit) yet remain validator-safe and distinct. */ +TEST(project_name_long_cjk_is_bounded) { + /* 60 repetitions of 开 (3 bytes -> "%E5%BC%80", 9 chars each) => far over the + * cap before bounding. */ + char buf[1 + (60 * 3) + 1]; + char *p = buf; + *p++ = '/'; + for (int i = 0; i < 60; i++) { + *p++ = (char)0xE5; + *p++ = (char)0xBC; + *p++ = (char)0x80; + } + *p = '\0'; + char *name = cbm_project_name_from_path(buf); + ASSERT_NOT_NULL(name); + ASSERT_TRUE(strlen(name) <= 200); /* 200 mirrors PROJECT_NAME_MAX_LEN in fqn.c */ + ASSERT_TRUE(strstr(name, "%80-") != NULL); /* kept prefix ends on a whole %XX escape, right before the hash */ + ASSERT_TRUE(cbm_validate_project_name(name)); + free(name); + PASS(); +} + +/* Two distinct long CJK paths must still map to distinct bounded slugs. */ +TEST(project_name_long_cjk_distinct) { + char a[1 + (60 * 3) + 1]; + char b[1 + (60 * 3) + 1]; + char *pa = a; + char *pb = b; + *pa++ = '/'; + *pb++ = '/'; + for (int i = 0; i < 60; i++) { + *pa++ = (char)0xE5; + *pa++ = (char)0xBC; + *pa++ = (char)0x80; /* 开 */ + *pb++ = (char)0xE5; + *pb++ = (char)0x90; + *pb++ = (char)0x8E; /* 后 */ + } + *pa = '\0'; + *pb = '\0'; + char *na = cbm_project_name_from_path(a); + char *nb = cbm_project_name_from_path(b); + ASSERT_NOT_NULL(na); + ASSERT_NOT_NULL(nb); + ASSERT_TRUE(strcmp(na, nb) != 0); + free(na); + free(nb); + PASS(); +} + /* issue #349: every derived project name must satisfy cbm_validate_project_name, * else the project is indexed + shown by list_projects but resolve_store rejects * the name → index_status/search_graph report project-not-found. */ @@ -582,4 +671,10 @@ SUITE(fqn) { RUN_TEST(project_name_colon_only); RUN_TEST(project_name_backslash_only); RUN_TEST(project_name_consecutive_colons); + RUN_TEST(project_name_percent_encodes_cjk); + RUN_TEST(project_name_distinct_cjk_dirs); + RUN_TEST(project_name_ascii_regression_space); + RUN_TEST(project_name_validator_percent_and_rejections); + RUN_TEST(project_name_long_cjk_is_bounded); + RUN_TEST(project_name_long_cjk_distinct); }