diff --git a/docs/builtin_type.md b/docs/builtin_type.md index c9c8947..6801b10 100644 --- a/docs/builtin_type.md +++ b/docs/builtin_type.md @@ -401,13 +401,25 @@ forcing the consumer to open the file just to size a playback window. | Field | Type | Notes | |-------|------|-------| | `time_origin_ns` | `std::optional` | Wall-clock instant of the first frame. Absent means the asset is not aligned to wall clock. | -| `duration_ns` | `std::optional` | Total playable duration. Absent means probe the file. | +| `start_ns` | `std::optional` | In-file offset (ns) where the playable window begins. Absent means "play from the start of the file". | +| `end_ns` | `std::optional` | In-file offset (ns) where the playable window ends. Absent means "play to the end of the file". | | `file_path` | `std::string` | Absolute path or path relative to a consumer-known root. | | `media_type` | `std::string` | MIME type hint. Empty means probe the file. | | `width` | `uint32_t` | Pixel width. `0` means unknown. | | `height` | `uint32_t` | Pixel height. `0` means unknown. | | `frame_rate` | `double` | Nominal FPS. `0` or NaN means unknown. | +When both `start_ns` and `end_ns` are absent, the whole file is the playable +window. When either bound is present, consumers must clamp seek requests to +`[start_ns, end_ns]` (an absent bound falls back to the start or end of the file) and bound timeline UI to that range. This is how +producers expose one clip out of a file that holds many concatenated +clips — for example LeRobot v3.0, where a single MP4 per camera packs +many episodes back-to-back and `[from_timestamp, to_timestamp]` in the +episode metadata maps directly to `[start_ns, end_ns]`. + +The total file duration is *not* carried in the message — the decoder +backend reports it. + `pj_base/builtin/asset_video_codec.hpp` serializes and deserializes this type using the canonical `PJ.AssetVideo` protobuf wire format. 
diff --git a/pj_base/include/pj_base/builtin/asset_video.hpp b/pj_base/include/pj_base/builtin/asset_video.hpp index 6f18ddd..e410d38 100644 --- a/pj_base/include/pj_base/builtin/asset_video.hpp +++ b/pj_base/include/pj_base/builtin/asset_video.hpp @@ -34,8 +34,13 @@ namespace sdk { /// seek position. An absent value means the asset is not aligned to wall /// clock and should not advance with the tracker. /// -/// `duration_ns` is the total playable duration when known; absent means -/// consumers may probe the file. +/// `start_ns` / `end_ns` describe an optional playable window inside the +/// file, expressed as in-file offsets in nanoseconds (zero = first frame +/// of the file). When both are absent the whole file is playable; when +/// present, consumers must clamp seek requests to `[start_ns, end_ns]` +/// and bound timeline UI to that range. Producers use this for assets +/// that share a file with other clips (e.g. LeRobot v3.0, where a single +/// MP4 holds many concatenated episodes per camera). /// /// `media_type` is a MIME-type hint ("video/mp4", "video/x-matroska", /// "video/av1"). Empty string means "probe the file". @@ -48,7 +53,8 @@ namespace sdk { /// average; actual per-frame timestamps come from the decoder. struct AssetVideo { std::optional time_origin_ns; ///< Wall-clock instant of the first frame. - std::optional duration_ns; ///< Total playable duration in nanoseconds. + std::optional start_ns; ///< In-file offset where the playable window begins (ns). + std::optional end_ns; ///< In-file offset where the playable window ends (ns). std::string file_path; ///< Absolute path or path relative to a consumer-known root. std::string media_type; ///< MIME type. Empty string means "probe the file". uint32_t width = 0; ///< Pixel width. 0 means unknown. 
diff --git a/pj_base/proto/pj/AssetVideo.proto b/pj_base/proto/pj/AssetVideo.proto index 9a83ab9..f738c63 100644 --- a/pj_base/proto/pj/AssetVideo.proto +++ b/pj_base/proto/pj/AssetVideo.proto @@ -6,7 +6,6 @@ syntax = "proto3"; -import "google/protobuf/duration.proto"; import "google/protobuf/timestamp.proto"; package PJ; @@ -21,10 +20,6 @@ message AssetVideo { // clock" and the asset will not advance with the tracker. google.protobuf.Timestamp time_origin = 1; - // Total playable duration of the video. Producers should set this when known (FFmpeg probe at registration is - // cheap). Consumers may fall back to probing the file if absent. Optional. - optional google.protobuf.Duration duration = 2; - // Path to the video file. Resolution is consumer-side: producers should emit either an absolute path or a path // relative to a consumer-known root (the dataset directory in LeRobot's case). Required. string file_path = 3; @@ -42,4 +37,11 @@ message AssetVideo { // Nominal frame rate in frames per second. Zero or NaN means "unknown — probe the file". For variable-frame-rate // video this is an advisory average; actual per-frame timestamps come from the decoder. double frame_rate = 7; + + // Optional playable window inside the file, expressed as in-file offsets in nanoseconds (zero = first frame of + // the file). When both are absent, the whole file is playable. When present, consumers must clamp seek requests + // to `[start_ns, end_ns]` and bound the timeline UI to that range. Used by producers that share a file across + // many clips (e.g. LeRobot v3.0, where one MP4 per camera holds many concatenated episodes). 
+ optional int64 start_ns = 8; + optional int64 end_ns = 9; } diff --git a/pj_base/src/builtin/asset_video_codec.cpp b/pj_base/src/builtin/asset_video_codec.cpp index 9adb126..0603f61 100644 --- a/pj_base/src/builtin/asset_video_codec.cpp +++ b/pj_base/src/builtin/asset_video_codec.cpp @@ -27,19 +27,22 @@ std::vector serializeAssetVideo(const AssetVideo& asset) { std::vector out; Writer writer(out); - // Both `time_origin` and `duration` use the seconds+nanos wire shape; - // omit the field entirely when the SDK optional is empty. + // `time_origin` uses the seconds+nanos wire shape; omit the field entirely + // when the SDK optional is empty. if (asset.time_origin_ns.has_value()) { writer.message(1, [&](Writer& nested) { builtin_wire::writeTimestamp(nested, *asset.time_origin_ns); }); } - if (asset.duration_ns.has_value()) { - writer.message(2, [&](Writer& nested) { builtin_wire::writeTimestamp(nested, *asset.duration_ns); }); - } writer.string(3, asset.file_path); writer.string(4, asset.media_type); writer.varint(5, asset.width); writer.varint(6, asset.height); writer.doubleField(7, asset.frame_rate); + if (asset.start_ns.has_value()) { + writer.varint(8, static_cast(*asset.start_ns)); + } + if (asset.end_ns.has_value()) { + writer.varint(9, static_cast(*asset.end_ns)); + } return out; } @@ -65,17 +68,6 @@ Expected deserializeAssetVideo(const uint8_t* data, size_t size asset.time_origin_ns = t; return true; } - case 2: { - if (tag.type != WireType::kLengthDelimited) { - return false; - } - Timestamp d = 0; - if (!builtin_wire::readTimestampMessage(r, d)) { - return false; - } - asset.duration_ns = d; - return true; - } case 3: return tag.type == WireType::kLengthDelimited && r.readString(asset.file_path); case 4: @@ -104,6 +96,28 @@ Expected deserializeAssetVideo(const uint8_t* data, size_t size } case 7: return tag.type == WireType::kFixed64 && r.readDouble(asset.frame_rate); + case 8: { + if (tag.type != WireType::kVarint) { + return false; + } + uint64_t v = 
0; + if (!r.readVarint(v)) { + return false; + } + asset.start_ns = static_cast(v); + return true; + } + case 9: { + if (tag.type != WireType::kVarint) { + return false; + } + uint64_t v = 0; + if (!r.readVarint(v)) { + return false; + } + asset.end_ns = static_cast(v); + return true; + } default: return false; } diff --git a/pj_base/tests/asset_video_codec_test.cpp b/pj_base/tests/asset_video_codec_test.cpp index 181d33a..df38958 100644 --- a/pj_base/tests/asset_video_codec_test.cpp +++ b/pj_base/tests/asset_video_codec_test.cpp @@ -27,7 +27,8 @@ TEST(AssetVideoCodecTest, EmptyBufferProducesError) { TEST(AssetVideoCodecTest, RoundTripFullyPopulated) { AssetVideo in; in.time_origin_ns = 1'700'000'000'000'000'000LL; - in.duration_ns = 60'000'000'000LL; // 60 s + in.start_ns = 12'000'000'000LL; // 12 s into the file + in.end_ns = 17'500'000'000LL; // 17.5 s into the file in.file_path = "/data/2026-05-21/camera0.mp4"; in.media_type = "video/mp4"; in.width = 1920; @@ -39,8 +40,10 @@ TEST(AssetVideoCodecTest, RoundTripFullyPopulated) { ASSERT_TRUE(out.has_value()); ASSERT_TRUE(out->time_origin_ns.has_value()); EXPECT_EQ(*out->time_origin_ns, *in.time_origin_ns); - ASSERT_TRUE(out->duration_ns.has_value()); - EXPECT_EQ(*out->duration_ns, *in.duration_ns); + ASSERT_TRUE(out->start_ns.has_value()); + EXPECT_EQ(*out->start_ns, *in.start_ns); + ASSERT_TRUE(out->end_ns.has_value()); + EXPECT_EQ(*out->end_ns, *in.end_ns); EXPECT_EQ(out->file_path, in.file_path); EXPECT_EQ(out->media_type, in.media_type); EXPECT_EQ(out->width, in.width); @@ -56,12 +59,41 @@ TEST(AssetVideoCodecTest, OptionalsAbsentRoundTrip) { auto out = deserializeAssetVideo(bytes.data(), bytes.size()); ASSERT_TRUE(out.has_value()); EXPECT_FALSE(out->time_origin_ns.has_value()); - EXPECT_FALSE(out->duration_ns.has_value()); + EXPECT_FALSE(out->start_ns.has_value()); + EXPECT_FALSE(out->end_ns.has_value()); EXPECT_EQ(out->file_path, in.file_path); EXPECT_TRUE(out->media_type.empty()); EXPECT_EQ(out->width, 0u); 
EXPECT_EQ(out->height, 0u); } +TEST(AssetVideoCodecTest, OneBoundSetOneAbsent) { + // start_ns set, end_ns absent — consumers should clamp to start_ns and let + // the decoder reveal the file's true end. + AssetVideo in_start_only; + in_start_only.file_path = "/data/file.mp4"; + in_start_only.start_ns = 5'000'000'000LL; + + const auto b1 = serializeAssetVideo(in_start_only); + auto out1 = deserializeAssetVideo(b1.data(), b1.size()); + ASSERT_TRUE(out1.has_value()); + ASSERT_TRUE(out1->start_ns.has_value()); + EXPECT_EQ(*out1->start_ns, *in_start_only.start_ns); + EXPECT_FALSE(out1->end_ns.has_value()); + + // end_ns set, start_ns absent — symmetric, lets producers cap playback + // without anchoring the start. + AssetVideo in_end_only; + in_end_only.file_path = "/data/file.mp4"; + in_end_only.end_ns = 9'000'000'000LL; + + const auto b2 = serializeAssetVideo(in_end_only); + auto out2 = deserializeAssetVideo(b2.data(), b2.size()); + ASSERT_TRUE(out2.has_value()); + EXPECT_FALSE(out2->start_ns.has_value()); + ASSERT_TRUE(out2->end_ns.has_value()); + EXPECT_EQ(*out2->end_ns, *in_end_only.end_ns); +} + } // namespace } // namespace PJ