Skip to content

Commit e8487f3

Browse files
[ET Device Support] Parse device info from serialized tensor in tensor_parser (#18966)
This PR was created by the merge bot to help merge the original PR into the main branch. ghstack PR number: #18328 by @Gasoonjia ^ Please use this as the source of truth for the PR details, comments, and reviews ghstack PR base: https://github.com/pytorch/executorch/tree/gh/gasoonjia/143/base ghstack PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/143/head Merge bot PR base: https://github.com/pytorch/executorch/tree/main Merge bot PR head: https://github.com/pytorch/executorch/tree/gh/gasoonjia/143/orig Differential Revision: [D97199497](https://our.internmc.facebook.com/intern/diff/D97199497/) @diff-train-skip-merge Co-authored-by: gasoonjia <gasoonjia@icloud.com>
1 parent f4019c3 commit e8487f3

5 files changed

Lines changed: 346 additions & 1 deletion

File tree

runtime/executor/tensor_parser_portable.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,18 @@ Result<Tensor> parseTensor(
147147
Internal,
148148
"dim_order_to_stride returned invalid status");
149149

150+
// Extract device info from serialized tensor metadata.
151+
// Defaults to CPU/0 for backward compatibility when extra_tensor_info is
152+
// absent (e.g., older PTE files without device annotations).
153+
auto device_type = executorch::runtime::etensor::DeviceType::CPU;
154+
executorch::runtime::etensor::DeviceIndex device_index = 0;
155+
if (s_tensor->extra_tensor_info() != nullptr) {
156+
device_type = static_cast<executorch::runtime::etensor::DeviceType>(
157+
s_tensor->extra_tensor_info()->device_type());
158+
device_index = static_cast<executorch::runtime::etensor::DeviceIndex>(
159+
s_tensor->extra_tensor_info()->device_index());
160+
}
161+
150162
auto* tensor_impl = method_allocator->allocateInstance<TensorImpl>();
151163
if (tensor_impl == nullptr) {
152164
return Error::MemoryAllocationFailed;
@@ -161,7 +173,9 @@ Result<Tensor> parseTensor(
161173
/*data=*/nullptr,
162174
dim_order,
163175
strides,
164-
dynamism);
176+
dynamism,
177+
device_type,
178+
device_index);
165179

166180
// Now that we know how big the tensor is, find and assign its memory.
167181
Result<void*> data_ptr = getTensorDataPtr(

runtime/executor/test/targets.bzl

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,3 +312,19 @@ def define_common_targets(is_fbcode = False):
312312
],
313313
env = modules_env,
314314
)
315+
316+
runtime.cxx_test(
317+
name = "tensor_parser_device_test",
318+
srcs = [
319+
"tensor_parser_device_test.cpp",
320+
],
321+
deps = [
322+
":managed_memory_manager",
323+
"//executorch/runtime/executor:program",
324+
"//executorch/extension/data_loader:file_data_loader",
325+
"//executorch/schema:program",
326+
],
327+
env = {
328+
"ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])",
329+
},
330+
)
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/**
10+
* Tests that device info (device_type) is correctly parsed from serialized
11+
* tensors in .pte files into TensorImpl at runtime.
12+
*
13+
* Uses a .pte exported with DeviceAwarePartitioner (CUDA device annotation)
14+
* so that delegate output tensors carry device_type=CUDA in ExtraTensorInfo.
15+
*/
16+
17+
#include <executorch/runtime/executor/tensor_parser.h>

#include <cstdlib>
#include <memory>
#include <utility>

#include <executorch/extension/data_loader/file_data_loader.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/executor/test/managed_memory_manager.h>
#include <executorch/schema/program_generated.h>

#include <gtest/gtest.h>
25+
26+
using executorch::aten::Tensor;
27+
using executorch::runtime::Error;
28+
using executorch::runtime::Program;
29+
using executorch::runtime::Result;
30+
using executorch::runtime::deserialization::parseTensor;
31+
using executorch::runtime::testing::ManagedMemoryManager;
32+
using torch::executor::util::FileDataLoader;
33+
34+
// Memory budgets handed to ManagedMemoryManager for each test below.
constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U;

namespace executorch {
namespace runtime {
namespace testing {
// Test-only accessor for Program internals. Exposes the deserialized
// flatbuffer root so the tests can iterate over serialized tensors directly.
// NOTE(review): relies on a friend declaration inside Program (not visible in
// this file) to reach the private internal_program_ member — confirm.
class ProgramTestFriend final {
 public:
  // Returns the program's underlying flatbuffer representation.
  const static executorch_flatbuffer::Program* GetInternalProgram(
      const Program* program) {
    return program->internal_program_;
  }
};
} // namespace testing
} // namespace runtime
} // namespace executorch

using executorch::runtime::testing::ProgramTestFriend;
52+
53+
// Fixture that loads the device-annotated .pte exported by
// export_program_with_device_info.py. The path is injected through the
// ET_MODULE_ADD_WITH_DEVICE_PATH env var by the build rule.
class TensorParserDeviceTest : public ::testing::Test {
 protected:
  void SetUp() override {
    const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
    // Fail fast with a clear message if the test is run outside the harness
    // that sets the env var.
    ASSERT_NE(path, nullptr)
        << "ET_MODULE_ADD_WITH_DEVICE_PATH env var not set";
    Result<FileDataLoader> loader = FileDataLoader::from(path);
    ASSERT_EQ(loader.error(), Error::Ok);
    // Move the loader onto the heap so it outlives SetUp and can back the
    // Program loaded inside each test body.
    loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
  }

  // Data loader for the .pte file; populated in SetUp, used by each test.
  std::unique_ptr<FileDataLoader> loader_;
};
66+
67+
// Walks every serialized tensor of the program's first execution plan and
// checks that tensors annotated with CUDA in ExtraTensorInfo come out of
// parseTensor with device_type == CUDA, while everything else defaults to CPU.
TEST_F(TensorParserDeviceTest, CUDADeviceParsedFromPteFile) {
  Result<Program> program =
      Program::load(loader_.get(), Program::Verification::Minimal);
  ASSERT_EQ(program.error(), Error::Ok);

  ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);

  // Reach into the program's flatbuffer (via the test friend) to enumerate
  // the serialized values of execution plan 0.
  const executorch_flatbuffer::Program* internal_program =
      ProgramTestFriend::GetInternalProgram(&program.get());
  auto* execution_plan =
      internal_program->execution_plan()->GetMutableObject(0);
  auto* flatbuffer_values = execution_plan->values();

  int cuda_tensor_count = 0;
  int cpu_tensor_count = 0;

  for (uint32_t i = 0; i < flatbuffer_values->size(); ++i) {
    auto* serialization_value = flatbuffer_values->Get(i);
    // Only tensor-typed values carry device annotations; skip the rest.
    if (serialization_value->val_type() !=
        executorch_flatbuffer::KernelTypes::Tensor) {
      continue;
    }

    auto* s_tensor = serialization_value->val_as_Tensor();

    Result<Tensor> tensor = parseTensor(&program.get(), &mmm.get(), s_tensor);
    if (!tensor.ok()) {
      // A CUDA-annotated tensor that fails to parse still counts toward the
      // CUDA total so the expected totals below hold.
      // NOTE(review): presumably delegate tensors can legitimately fail data
      // resolution in this setup — confirm the failure is expected.
      bool has_cuda = s_tensor->extra_tensor_info() != nullptr &&
          s_tensor->extra_tensor_info()->device_type() ==
              executorch_flatbuffer::DeviceType::CUDA;
      if (has_cuda) {
        cuda_tensor_count++;
      }
      continue;
    }

    Tensor t = tensor.get();
    auto device_type = t.unsafeGetTensorImpl()->device_type();

    if (device_type == executorch::runtime::etensor::DeviceType::CUDA) {
      cuda_tensor_count++;
      // device_index defaults to 0; the exporter annotates cuda:0.
      EXPECT_EQ(t.unsafeGetTensorImpl()->device_index(), 0)
          << "CUDA tensor should have device_index=0";
    } else {
      // Anything not CUDA must have fallen back to the CPU default.
      EXPECT_EQ(device_type, executorch::runtime::etensor::DeviceType::CPU);
      EXPECT_EQ(t.unsafeGetTensorImpl()->device_index(), 0)
          << "CPU tensor should have device_index=0";
      cpu_tensor_count++;
    }
  }

  // Expected counts are specific to ModuleAddWithDevice (a single delegated
  // add): 2 delegate inputs + 1 delegate output, all annotated CUDA.
  EXPECT_EQ(cuda_tensor_count, 3)
      << "Expected 3 CUDA tensors (2 delegate inputs + 1 delegate output)";
  EXPECT_EQ(cpu_tensor_count, 0)
      << "Expected 0 CPU tensors (all annotated as CUDA)";
}
123+
124+
// Every serialized tensor with no CUDA annotation must come out of the parser
// with the default device (CPU) and device_index 0.
TEST_F(TensorParserDeviceTest, NonDelegatedTensorsDefaultToCPU) {
  Result<Program> program =
      Program::load(loader_.get(), Program::Verification::Minimal);
  ASSERT_EQ(program.error(), Error::Ok);

  ManagedMemoryManager mmm(kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);

  const executorch_flatbuffer::Program* fb_program =
      ProgramTestFriend::GetInternalProgram(&program.get());
  auto* plan = fb_program->execution_plan()->GetMutableObject(0);
  auto* values = plan->values();

  // Predicate: does the serialized tensor explicitly name CUDA as its device?
  const auto annotated_cuda = [](const auto* st) {
    const auto* info = st->extra_tensor_info();
    return info != nullptr &&
        info->device_type() == executorch_flatbuffer::DeviceType::CUDA;
  };

  for (uint32_t i = 0; i < values->size(); ++i) {
    auto* value = values->Get(i);
    if (value->val_type() != executorch_flatbuffer::KernelTypes::Tensor) {
      continue;  // Non-tensor values carry no device info.
    }

    auto* s_tensor = value->val_as_Tensor();
    // Only check tensors that are NOT annotated as CUDA.
    if (annotated_cuda(s_tensor)) {
      continue;
    }

    Result<Tensor> parsed = parseTensor(&program.get(), &mmm.get(), s_tensor);
    if (!parsed.ok()) {
      continue;  // Unparseable tensors are exercised by the other test.
    }

    Tensor t = parsed.get();
    auto* impl = t.unsafeGetTensorImpl();
    EXPECT_EQ(
        impl->device_type(), executorch::runtime::etensor::DeviceType::CPU)
        << "Tensor at index " << i
        << " without CUDA annotation should default to CPU";
    EXPECT_EQ(impl->device_index(), 0)
        << "Tensor at index " << i
        << " without device annotation should have device_index=0";
  }
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# pyre-unsafe
8+
9+
"""Exports a simple model with device-annotated tensors for C++ testing.
10+
11+
Uses DeviceAwarePartitioner (BackendWithCompilerDemo + target_device=cuda:0)
12+
so that delegate output tensors are annotated with CUDA device in the .pte.
13+
"""
14+
15+
import argparse
16+
import os
17+
from typing import Dict, final
18+
19+
import torch
20+
from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
21+
from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
22+
generate_pattern_op_partitions,
23+
)
24+
from executorch.exir.backend.compile_spec_schema import CompileSpec
25+
from executorch.exir.backend.partitioner import (
26+
DelegationSpec,
27+
Partitioner,
28+
PartitionResult,
29+
)
30+
from executorch.exir.backend.test.backend_with_compiler_demo import (
31+
BackendWithCompilerDemo,
32+
)
33+
from executorch.exir.dialects._ops import ops as exir_ops
34+
from executorch.exir.passes.propagate_device_pass import TARGET_DEVICE_COMPILE_SPEC_KEY
35+
from torch import nn
36+
from torch.export import export
37+
from torch.fx.passes.operator_support import any_chain, OperatorSupportBase
38+
39+
40+
class _AddOperatorSupport(OperatorSupportBase):
41+
def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
42+
return node.op == "call_function" and node.target in [
43+
exir_ops.edge.aten.add.Tensor,
44+
]
45+
46+
47+
@final
class _DeviceAwarePartitioner(Partitioner):
    """Partitioner that tags add ops for delegation with target_device=cuda:0."""

    def __init__(self) -> None:
        super().__init__()
        # "max_value" is a compile spec consumed by BackendWithCompilerDemo; the
        # TARGET_DEVICE spec is what causes delegate tensors to be annotated
        # with the CUDA device in the serialized program.
        self.delegation_spec = DelegationSpec(
            BackendWithCompilerDemo.__name__,
            [
                CompileSpec("max_value", bytes([4])),
                CompileSpec(TARGET_DEVICE_COMPILE_SPEC_KEY, b"cuda:0"),
            ],
        )

    def partition(self, exported_program) -> PartitionResult:
        """Tag every add-op partition of the graph for delegation.

        Returns a PartitionResult mapping each partition's tag to the CUDA
        delegation spec; downstream lowering delegates the tagged nodes.
        """
        partition_tags: Dict[str, DelegationSpec] = {}
        partition_list = generate_pattern_op_partitions(
            exported_program.graph_module,
            op_support=any_chain(_AddOperatorSupport()),
        )
        for partition in partition_list:
            # The tag is per-partition, not per-node: compute it once instead
            # of rebuilding the f-string and re-assigning the spec on every
            # node iteration (the original did both inside the inner loop).
            tag = f"tag{partition.id}"
            if partition.nodes:
                # Register the spec only when the partition actually has nodes,
                # matching the original behavior for (hypothetical) empty ones.
                partition_tags[tag] = self.delegation_spec
            for node in partition.nodes:
                node.meta["delegation_tag"] = tag
        return PartitionResult(
            tagged_exported_program=exported_program,
            partition_tags=partition_tags,
        )
76+
77+
78+
class ModuleAddWithDevice(nn.Module):
    """Elementwise add of two tensors.

    The single add op is what _DeviceAwarePartitioner delegates, so the
    exported program's delegate tensors carry the CUDA device annotation.
    """

    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # One aten.add.Tensor node — the only candidate for delegation.
        return torch.add(a, b)

    def get_random_inputs(self):
        # Two independent 2x2 float tensors for tracing/export.
        return tuple(torch.randn(2, 2) for _ in range(2))
86+
87+
88+
def main() -> None:
    """Export ModuleAddWithDevice to ``<outdir>/ModuleAddWithDevice.pte``.

    The add op is lowered through _DeviceAwarePartitioner, so the delegate
    tensors in the emitted .pte carry a CUDA device annotation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=str, required=True)
    args = parser.parse_args()

    # Fixed seed keeps the example inputs (and thus the emitted file)
    # reproducible across runs.
    torch.manual_seed(0)
    model = ModuleAddWithDevice()
    inputs = model.get_random_inputs()

    # NOTE(review): _check_ir_validity=False disables strict edge-dialect
    # validation — presumably required by the demo backend; confirm.
    edge = to_edge(
        export(model, inputs),
        compile_config=EdgeCompileConfig(_check_ir_validity=False),
    )
    lowered = edge.to_backend(_DeviceAwarePartitioner())
    et_prog = lowered.to_executorch(ExecutorchBackendConfig(emit_stacktrace=False))

    os.makedirs(args.outdir, exist_ok=True)
    outfile = os.path.join(args.outdir, "ModuleAddWithDevice.pte")

    # Serialized flatbuffer program bytes are exposed via .buffer.
    with open(outfile, "wb") as fp:
        fp.write(et_prog.buffer)
    print(f"Exported ModuleAddWithDevice to {outfile}")


if __name__ == "__main__":
    main()

test/models/targets.bzl

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,27 @@ def define_common_targets():
141141
visibility = [], # Private
142142
)
143143

144+
runtime.python_library(
145+
name = "export_program_with_device_info_lib",
146+
srcs = ["export_program_with_device_info.py"],
147+
deps = [
148+
"//caffe2:torch",
149+
"//executorch/exir/backend/test:backend_with_compiler_demo",
150+
"//executorch/exir:lib",
151+
],
152+
visibility = [], # Private
153+
)
154+
155+
runtime.python_binary(
156+
name = "export_program_with_device_info",
157+
main_module = "executorch.test.models.export_program_with_device_info",
158+
par_style = "xar",
159+
deps = [
160+
":export_program_with_device_info_lib",
161+
],
162+
visibility = [], # Private
163+
)
164+
144165
runtime.python_binary(
145166
name = "export_delegated_program",
146167
main_module = "executorch.test.models.export_delegated_program",
@@ -196,6 +217,18 @@ def define_common_targets():
196217
],
197218
)
198219

220+
runtime.genrule(
221+
name = "exported_program_with_device_info",
222+
cmd = "$(exe :export_program_with_device_info) --outdir $OUT",
223+
outs = {
224+
"ModuleAddWithDevice.pte": ["ModuleAddWithDevice.pte"],
225+
},
226+
default_outs = ["."],
227+
visibility = [
228+
"//executorch/runtime/executor/test/...",
229+
],
230+
)
231+
199232
runtime.genrule(
200233
name = "exported_xnnp_delegated_programs",
201234
cmd = "$(exe :export_delegated_program)" +

0 commit comments

Comments
 (0)