Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion docs/builtin_type.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,13 +401,25 @@ forcing the consumer to open the file just to size a playback window.
| Field | Type | Notes |
|-------|------|-------|
| `time_origin_ns` | `std::optional<Timestamp>` | Wall-clock instant of the first frame. Absent means the asset is not aligned to wall clock. |
| `duration_ns` | `std::optional<int64_t>` | Total playable duration. Absent means probe the file. |
| `start_ns` | `std::optional<int64_t>` | In-file offset (ns) where the playable window begins. Absent means "play from the start of the file". |
| `end_ns` | `std::optional<int64_t>` | In-file offset (ns) where the playable window ends. Absent means "play to the end of the file". |
| `file_path` | `std::string` | Absolute path or path relative to a consumer-known root. |
| `media_type` | `std::string` | MIME type hint. Empty means probe the file. |
| `width` | `uint32_t` | Pixel width. `0` means unknown. |
| `height` | `uint32_t` | Pixel height. `0` means unknown. |
| `frame_rate` | `double` | Nominal FPS. `0` or NaN means unknown. |

When both `start_ns` and `end_ns` are absent the whole file is the playable
window. When either is present, consumers must clamp seek requests to the
playable window and bound timeline UI to that range; an absent `start_ns`
means the start of the file and an absent `end_ns` means the end of the
file. This is how producers expose one clip out of a file that holds many
concatenated clips — for example LeRobot v3.0, where a single MP4 per
camera packs many episodes back-to-back and `[from_timestamp, to_timestamp]`
in the episode metadata maps to `[start_ns, end_ns]` once converted to
nanoseconds.

The total file duration is *not* carried in the message — the decoder
backend reports it.

`pj_base/builtin/asset_video_codec.hpp` serializes and deserializes this
type using the canonical `PJ.AssetVideo` protobuf wire format.

Expand Down
12 changes: 9 additions & 3 deletions pj_base/include/pj_base/builtin/asset_video.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,13 @@ namespace sdk {
/// seek position. An absent value means the asset is not aligned to wall
/// clock and should not advance with the tracker.
///
/// `duration_ns` is the total playable duration when known; absent means
/// consumers may probe the file.
/// `start_ns` / `end_ns` describe an optional playable window inside the
/// file, expressed as in-file offsets in nanoseconds (zero = first frame
/// of the file). When both are absent the whole file is playable; when
/// either is present, consumers must clamp seek requests to the window
/// and bound timeline UI to that range, treating an absent `start_ns` as
/// the start of the file and an absent `end_ns` as the end of the file.
/// Producers use this for assets that share a file with other clips
/// (e.g. LeRobot v3.0, where a single MP4 holds many concatenated
/// episodes per camera).
///
/// `media_type` is a MIME-type hint ("video/mp4", "video/x-matroska",
/// "video/av1"). Empty string means "probe the file".
Expand All @@ -48,7 +53,8 @@ namespace sdk {
/// average; actual per-frame timestamps come from the decoder.
struct AssetVideo {
std::optional<Timestamp> time_origin_ns; ///< Wall-clock instant of the first frame.
std::optional<int64_t> duration_ns; ///< Total playable duration in nanoseconds.
std::optional<int64_t> start_ns; ///< In-file offset where the playable window begins (ns).
std::optional<int64_t> end_ns; ///< In-file offset where the playable window ends (ns).
std::string file_path; ///< Absolute path or path relative to a consumer-known root.
std::string media_type; ///< MIME type. Empty string means "probe the file".
uint32_t width = 0; ///< Pixel width. 0 means unknown.
Expand Down
12 changes: 7 additions & 5 deletions pj_base/proto/pj/AssetVideo.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

syntax = "proto3";

import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";

package PJ;
Expand All @@ -21,10 +20,6 @@ message AssetVideo {
// clock" and the asset will not advance with the tracker.
google.protobuf.Timestamp time_origin = 1;

// Total playable duration of the video. Producers should set this when known (FFmpeg probe at registration is
// cheap). Consumers may fall back to probing the file if absent. Optional.
optional google.protobuf.Duration duration = 2;

// Path to the video file. Resolution is consumer-side: producers should emit either an absolute path or a path
// relative to a consumer-known root (the dataset directory in LeRobot's case). Required.
string file_path = 3;
Expand All @@ -42,4 +37,11 @@ message AssetVideo {
// Nominal frame rate in frames per second. Zero or NaN means "unknown — probe the file". For variable-frame-rate
// video this is an advisory average; actual per-frame timestamps come from the decoder.
double frame_rate = 7;

// Optional playable window inside the file, expressed as in-file offsets in nanoseconds (zero = first frame of
// the file). When both are absent, the whole file is playable. When either is present, consumers must clamp seek
// requests to the window and bound the timeline UI to that range; an absent `start_ns` means the start of the file
// and an absent `end_ns` means the end of the file. Used by producers that share a file across many clips (e.g.
// LeRobot v3.0, where one MP4 per camera holds many concatenated episodes).
optional int64 start_ns = 8;
optional int64 end_ns = 9;
}
46 changes: 30 additions & 16 deletions pj_base/src/builtin/asset_video_codec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,22 @@ std::vector<uint8_t> serializeAssetVideo(const AssetVideo& asset) {
std::vector<uint8_t> out;
Writer writer(out);

// Both `time_origin` and `duration` use the seconds+nanos wire shape;
// omit the field entirely when the SDK optional is empty.
// `time_origin` uses the seconds+nanos wire shape; omit the field entirely
// when the SDK optional is empty.
if (asset.time_origin_ns.has_value()) {
writer.message(1, [&](Writer& nested) { builtin_wire::writeTimestamp(nested, *asset.time_origin_ns); });
}
if (asset.duration_ns.has_value()) {
writer.message(2, [&](Writer& nested) { builtin_wire::writeTimestamp(nested, *asset.duration_ns); });
}
writer.string(3, asset.file_path);
writer.string(4, asset.media_type);
writer.varint(5, asset.width);
writer.varint(6, asset.height);
writer.doubleField(7, asset.frame_rate);
if (asset.start_ns.has_value()) {
writer.varint(8, static_cast<uint64_t>(*asset.start_ns));
}
if (asset.end_ns.has_value()) {
writer.varint(9, static_cast<uint64_t>(*asset.end_ns));
}

return out;
}
Expand All @@ -65,17 +68,6 @@ Expected<sdk::AssetVideo> deserializeAssetVideo(const uint8_t* data, size_t size
asset.time_origin_ns = t;
return true;
}
case 2: {
if (tag.type != WireType::kLengthDelimited) {
return false;
}
Timestamp d = 0;
if (!builtin_wire::readTimestampMessage(r, d)) {
return false;
}
asset.duration_ns = d;
return true;
}
case 3:
return tag.type == WireType::kLengthDelimited && r.readString(asset.file_path);
case 4:
Expand Down Expand Up @@ -104,6 +96,28 @@ Expected<sdk::AssetVideo> deserializeAssetVideo(const uint8_t* data, size_t size
}
case 7:
return tag.type == WireType::kFixed64 && r.readDouble(asset.frame_rate);
case 8: {
if (tag.type != WireType::kVarint) {
return false;
}
uint64_t v = 0;
if (!r.readVarint(v)) {
return false;
}
asset.start_ns = static_cast<int64_t>(v);
return true;
}
case 9: {
if (tag.type != WireType::kVarint) {
return false;
}
uint64_t v = 0;
if (!r.readVarint(v)) {
return false;
}
asset.end_ns = static_cast<int64_t>(v);
return true;
}
default:
return false;
}
Expand Down
40 changes: 36 additions & 4 deletions pj_base/tests/asset_video_codec_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ TEST(AssetVideoCodecTest, EmptyBufferProducesError) {
TEST(AssetVideoCodecTest, RoundTripFullyPopulated) {
AssetVideo in;
in.time_origin_ns = 1'700'000'000'000'000'000LL;
in.duration_ns = 60'000'000'000LL; // 60 s
in.start_ns = 12'000'000'000LL; // 12 s into the file
in.end_ns = 17'500'000'000LL; // 17.5 s into the file
in.file_path = "/data/2026-05-21/camera0.mp4";
in.media_type = "video/mp4";
in.width = 1920;
Expand All @@ -39,8 +40,10 @@ TEST(AssetVideoCodecTest, RoundTripFullyPopulated) {
ASSERT_TRUE(out.has_value());
ASSERT_TRUE(out->time_origin_ns.has_value());
EXPECT_EQ(*out->time_origin_ns, *in.time_origin_ns);
ASSERT_TRUE(out->duration_ns.has_value());
EXPECT_EQ(*out->duration_ns, *in.duration_ns);
ASSERT_TRUE(out->start_ns.has_value());
EXPECT_EQ(*out->start_ns, *in.start_ns);
ASSERT_TRUE(out->end_ns.has_value());
EXPECT_EQ(*out->end_ns, *in.end_ns);
EXPECT_EQ(out->file_path, in.file_path);
EXPECT_EQ(out->media_type, in.media_type);
EXPECT_EQ(out->width, in.width);
Expand All @@ -56,12 +59,41 @@ TEST(AssetVideoCodecTest, OptionalsAbsentRoundTrip) {
auto out = deserializeAssetVideo(bytes.data(), bytes.size());
ASSERT_TRUE(out.has_value());
EXPECT_FALSE(out->time_origin_ns.has_value());
EXPECT_FALSE(out->duration_ns.has_value());
EXPECT_FALSE(out->start_ns.has_value());
EXPECT_FALSE(out->end_ns.has_value());
EXPECT_EQ(out->file_path, in.file_path);
EXPECT_TRUE(out->media_type.empty());
EXPECT_EQ(out->width, 0u);
EXPECT_EQ(out->height, 0u);
}

// Verifies that the codec keeps the two window bounds independent: setting
// exactly one of `start_ns` / `end_ns` must round-trip with that bound
// present and the other still absent.
TEST(AssetVideoCodecTest, OneBoundSetOneAbsent) {
  {
    // Lower bound only: `start_ns` survives the round trip and `end_ns`
    // stays empty, leaving the end of playback to be discovered from the
    // file by the decoder.
    AssetVideo lower_only;
    lower_only.start_ns = 5'000'000'000LL;
    lower_only.file_path = "/data/file.mp4";

    const auto encoded = serializeAssetVideo(lower_only);
    auto decoded = deserializeAssetVideo(encoded.data(), encoded.size());
    ASSERT_TRUE(decoded.has_value());
    ASSERT_TRUE(decoded->start_ns.has_value());
    EXPECT_EQ(*decoded->start_ns, *lower_only.start_ns);
    EXPECT_FALSE(decoded->end_ns.has_value());
  }

  {
    // Upper bound only: the mirror case, where a producer caps playback
    // without pinning where it starts.
    AssetVideo upper_only;
    upper_only.end_ns = 9'000'000'000LL;
    upper_only.file_path = "/data/file.mp4";

    const auto encoded = serializeAssetVideo(upper_only);
    auto decoded = deserializeAssetVideo(encoded.data(), encoded.size());
    ASSERT_TRUE(decoded.has_value());
    EXPECT_FALSE(decoded->start_ns.has_value());
    ASSERT_TRUE(decoded->end_ns.has_value());
    EXPECT_EQ(*decoded->end_ns, *upper_only.end_ns);
  }
}

} // namespace
} // namespace PJ
Loading