Skip to content

Commit 2d6ce23

Browse files
[SYCLomatic][PyTorch] Adding support for all possible torch.cuda and c10::cuda APIs & Macros (#2580)
* Added support for torch.cuda and c10::cuda APIs * Added macro migration rules as well
1 parent fccacb8 commit 2d6ce23

26 files changed

Lines changed: 546 additions & 355 deletions

File tree

clang/lib/DPCT/AnalysisInfo.cpp

Lines changed: 6 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -955,8 +955,12 @@ void DpctFileInfo::insertHeader(HeaderType Type, unsigned Offset,
955955
if (auto Iter = FirstIncludeOffset.find(
956956
DpctGlobalInfo::getInstance().getMainFile());
957957
Iter != FirstIncludeOffset.end())
958-
insertHeader("#include \"" + File + +"\"" + getNL(), Iter->second,
959-
InsertPosition::IP_Right);
958+
if (!File.empty() && File[0] == '<')
959+
insertHeader("#include " + File + getNL(), Iter->second,
960+
InsertPosition::IP_Right);
961+
else
962+
insertHeader("#include \"" + File + "\"" + getNL(), Iter->second,
963+
InsertPosition::IP_Right);
960964
}
961965
return;
962966

Lines changed: 45 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,14 +1,58 @@
11
from torch import xpu
22

3+
cuda_ver = torch.version.xpu
4+
5+
#init
6+
torch.xpu.init()
7+
xpu.init()
8+
is_init = torch.xpu.is_initialized()
9+
is_init = xpu.is_initialized()
10+
11+
# device APIs
312
devs = torch.xpu.device_count()
413
devs = xpu.device_count()
514

15+
dev = torch.xpu.current_device()
16+
dev = xpu.current_device()
17+
18+
torch.xpu.set_device(dev)
19+
xpu.set_device(dev)
20+
21+
d_props = torch.xpu.get_device_properties(dev)
22+
d_props = xpu.get_device_properties(dev)
23+
24+
curr_d_name = torch.xpu.get_device_name()
25+
curr_d_name = xpu.get_device_name()
26+
d_name = torch.xpu.get_device_name(dev)
27+
d_name = xpu.get_device_name(dev)
28+
629
d_cap = torch.xpu.get_device_capability()
730
d_cap = xpu.get_device_capability()
831
d0_cap = torch.xpu.get_device_capability(devs[0])
932
d0_cap = xpu.get_device_capability(devs[0])
1033

34+
dev_of_obj = torch.xpu.device_of(obj)
35+
dev_of_obj = xpu.device_of(obj)
36+
1137
arch_list = ['']
1238
arch_list = ['']
1339

14-
cuda_ver = torch.version.xpu
40+
torch.xpu.synchronize()
41+
xpu.synchronize()
42+
torch.xpu.synchronize(dev)
43+
xpu.synchronize(dev)
44+
45+
# stream APIs
46+
curr_st = torch.xpu.current_stream()
47+
curr_st = xpu.current_stream()
48+
curr_d_st = torch.xpu.current_stream(dev)
49+
curr_d_st = xpu.current_stream(dev)
50+
51+
st = torch.xpu.StreamContext(curr_st)
52+
st = xpu.StreamContext(curr_st)
53+
54+
stS = torch.xpu.stream(st)
55+
stS = xpu.stream(st)
56+
57+
torch.xpu.set_stream(st)
58+
xpu.set_stream(st)
Lines changed: 45 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,14 +1,58 @@
11
from torch import cuda
22

3+
cuda_ver = torch.version.cuda
4+
5+
#init
6+
torch.cuda.init()
7+
cuda.init()
8+
is_init = torch.cuda.is_initialized()
9+
is_init = cuda.is_initialized()
10+
11+
# device APIs
312
devs = torch.cuda.device_count()
413
devs = cuda.device_count()
514

15+
dev = torch.cuda.current_device()
16+
dev = cuda.current_device()
17+
18+
torch.cuda.set_device(dev)
19+
cuda.set_device(dev)
20+
21+
d_props = torch.cuda.get_device_properties(dev)
22+
d_props = cuda.get_device_properties(dev)
23+
24+
curr_d_name = torch.cuda.get_device_name()
25+
curr_d_name = cuda.get_device_name()
26+
d_name = torch.cuda.get_device_name(dev)
27+
d_name = cuda.get_device_name(dev)
28+
629
d_cap = torch.cuda.get_device_capability()
730
d_cap = cuda.get_device_capability()
831
d0_cap = torch.cuda.get_device_capability(devs[0])
932
d0_cap = cuda.get_device_capability(devs[0])
1033

34+
dev_of_obj = torch.cuda.device_of(obj)
35+
dev_of_obj = cuda.device_of(obj)
36+
1137
arch_list = torch.cuda.get_arch_list()
1238
arch_list = cuda.get_arch_list()
1339

14-
cuda_ver = torch.version.cuda
40+
torch.cuda.synchronize()
41+
cuda.synchronize()
42+
torch.cuda.synchronize(dev)
43+
cuda.synchronize(dev)
44+
45+
# stream APIs
46+
curr_st = torch.cuda.current_stream()
47+
curr_st = cuda.current_stream()
48+
curr_d_st = torch.cuda.current_stream(dev)
49+
curr_d_st = cuda.current_stream(dev)
50+
51+
st = torch.cuda.StreamContext(curr_st)
52+
st = cuda.StreamContext(curr_st)
53+
54+
stS = torch.cuda.stream(st)
55+
stS = cuda.stream(st)
56+
57+
torch.cuda.set_stream(st)
58+
cuda.set_stream(st)

clang/test/dpct/pytorch/ATen.cu

Lines changed: 51 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -1,23 +1,68 @@
11
// RUN: rm -rf %T/pytorch/ATen
22
// RUN: mkdir -p %T/pytorch/ATen/src
33
// RUN: cp %S/ATen.cu %T/pytorch/ATen/src/
4-
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/ATen/
5-
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/ATen/
4+
// RUN: cp -r %S/pytorch_inc %T/pytorch/ATen/
65
// RUN: cd %T/pytorch/ATen
76
// RUN: mkdir dpct_out
8-
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/ATen/user_defined_rule_pytorch.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_cuda_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
7+
// RUN: dpct --out-root dpct_out %T/pytorch/ATen/src/ATen.cu --extra-arg="-I%T/pytorch/ATen/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml --analysis-scope-path %T/pytorch/ATen/pytorch_inc --analysis-scope-path %T/pytorch/ATen/src --in-root %T/pytorch/ATen/src
98
// RUN: FileCheck --input-file %T/pytorch/ATen/dpct_out/ATen.dp.cpp --match-full-lines %T/pytorch/ATen/src/ATen.cu
10-
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/ATen/dpct_out/ATen.dp.cpp -o %T/pytorch/ATen/dpct_out/ATen.dp.o %}
119

12-
#ifndef NO_BUILD_TEST
10+
// CHECK: #include <c10/xpu/XPUStream.h>
1311
#include <iostream>
1412
// CHECK: #include <ATen/xpu/XPUContext.h>
1513
#include <ATen/cuda/CUDAContext.h>
1614
// CHECK: #include <ATen/core/Tensor.h>
1715
#include <ATen/core/Tensor.h>
1816

17+
// CHECK: #include <ATen/Tensor.h>
18+
// CHECK-NEXT: #include <c10/util/Half.h>
19+
#include <ATen/cuda/CUDATensorMethods.cuh>
20+
21+
#define AT_CUDA_CHECK(stmt) (stmt)
22+
23+
// CHECK: #define BE_AT_CHECK
24+
#define BE_AT_CHECK AT_CUDA_CHECK
25+
26+
27+
__global__ void kernel() {}
28+
29+
void test_CUDAStream_as_arg() {
30+
dim3 gridSize(2, 2, 1);
31+
dim3 blockSize(8, 8, 1);
32+
void *args[] = {nullptr};
33+
34+
// CHECK: ([&]() {
35+
// CHECK-NEXT: ((sycl::queue *)(c10::xpu::getCurrentXPUStream()))
36+
// CHECK-NEXT: ->parallel_for(sycl::nd_range<3>(gridSize * blockSize, blockSize),
37+
// CHECK-NEXT: [=](sycl::nd_item<3> item_ct1) {
38+
// CHECK-NEXT: kernel();
39+
// CHECK-NEXT: });
40+
// CHECK-NEXT: return 0;
41+
// CHECK-NEXT: }());
42+
AT_CUDA_CHECK(cudaLaunchKernel((const void *)kernel, gridSize, blockSize, args, 0, at::cuda::getCurrentCUDAStream()));
43+
}
44+
1945
int main() {
46+
// CHECK: dpct::queue_ptr st =
47+
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream());
48+
cudaStream_t st = 0;
49+
50+
// stream APIs
51+
at::DeviceIndex devInd = 1;
52+
53+
// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
54+
auto currentStream = at::cuda::getCurrentCUDAStream();
55+
// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(devInd);
56+
auto deviceStream = at::cuda::getCurrentCUDAStream(devInd);
57+
58+
// CHECK: dpct::queue_ptr curr_cuda_st =
59+
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
60+
cudaStream_t curr_cuda_st = at::cuda::getCurrentCUDAStream().stream();
61+
// CHECK: dpct::queue_ptr dev_cuda_st = &static_cast<sycl::queue &>(
62+
// CHECK-NEXT: c10::xpu::getCurrentXPUStream(devInd).queue());
63+
cudaStream_t dev_cuda_st = at::cuda::getCurrentCUDAStream(devInd).stream();
64+
65+
test_CUDAStream_as_arg();
2066

2167
return 0;
2268
}
23-
#endif

clang/test/dpct/pytorch/c10.cu

Lines changed: 45 additions & 12 deletions
Original file line number · Diff line number · Diff line change
@@ -1,24 +1,56 @@
11
// RUN: rm -rf %T/pytorch/c10
22
// RUN: mkdir -p %T/pytorch/c10/src
33
// RUN: cp %S/c10.cu %T/pytorch/c10/src/
4-
// RUN: cp %S/user_defined_rule_pytorch.yaml %T/pytorch/c10/
5-
// RUN: cp -r %S/pytorch_cuda_inc %T/pytorch/c10/
4+
// RUN: cp -r %S/pytorch_inc %T/pytorch/c10/
65
// RUN: cd %T/pytorch/c10
76
// RUN: mkdir dpct_out
8-
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_cuda_inc" --cuda-include-path="%cuda-path/include" --rule-file=%T/pytorch/c10/user_defined_rule_pytorch.yaml -- -x cuda --cuda-host-only
7+
// RUN: dpct -out-root dpct_out %T/pytorch/c10/src/c10.cu --extra-arg="-I%T/pytorch/c10/pytorch_inc" --cuda-include-path="%cuda-path/include" --rule-file=%S/../../../tools/dpct/DpctOptRules/pytorch_api.yaml -- -x cuda --cuda-host-only
98
// RUN: FileCheck --input-file %T/pytorch/c10/dpct_out/c10.dp.cpp --match-full-lines %T/pytorch/c10/src/c10.cu
10-
// RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/pytorch/c10/dpct_out/c10.dp.cpp -o %T/pytorch/c10/dpct_out/c10.dp.o %}
119

12-
#ifndef NO_BUILD_TEST
1310
#include <iostream>
14-
// CHECK: #include <c10/xpu/XPUStream.h>
15-
#include <c10/cuda/CUDAStream.h>
1611
// CHECK: #include <c10/core/DeviceGuard.h>
1712
#include <c10/cuda/CUDAGuard.h>
13+
// CHECK: #include <c10/xpu/XPUStream.h>
14+
#include <c10/cuda/CUDAStream.h>
15+
// CHECK: #include <c10/xpu/XPUFunctions.h>
16+
#include <c10/cuda/CUDAFunctions.h>
17+
// CHECK: #include <c10/xpu/XPUMacros.h>
18+
#include <c10/cuda/CUDAMacros.h>
19+
20+
// CHECK: #define BE_BUILD_SHARED_LIBS C10_XPU_BUILD_SHARED_LIBS
21+
// CHECK-NEXT: #define BE_EXPORT C10_XPU_EXPORT
22+
// CHECK-NEXT: #define BE_IMPORT C10_XPU_IMPORT
23+
// CHECK-NEXT: #define BE_API C10_XPU_API
24+
// CHECK-NEXT: #define BE_BUILD_MAIN_LIB C10_XPU_BUILD_MAIN_LIB
25+
#define BE_BUILD_SHARED_LIBS C10_CUDA_BUILD_SHARED_LIBS
26+
#define BE_EXPORT C10_CUDA_EXPORT
27+
#define BE_IMPORT C10_CUDA_IMPORT
28+
#define BE_API C10_CUDA_API
29+
#define BE_BUILD_MAIN_LIB C10_CUDA_BUILD_MAIN_LIB
1830

1931
int main() {
20-
std::optional<c10::Device> device;
32+
// device APIs
33+
// CHECK: c10::DeviceIndex num_devices = c10::xpu::device_count();
34+
c10::DeviceIndex num_devices = c10::cuda::device_count();
35+
36+
// CHECK: c10::DeviceIndex num_devices_ensured =
37+
// CHECK-NEXT: c10::xpu::device_count_ensure_non_zero();
38+
c10::DeviceIndex num_devices_ensured = c10::cuda::device_count_ensure_non_zero();
2139

40+
// CHECK: c10::DeviceIndex current_device = c10::xpu::current_device();
41+
c10::DeviceIndex current_device = c10::cuda::current_device();
42+
43+
c10::DeviceIndex new_device = 1;
44+
// CHECK: c10::xpu::set_device(new_device);
45+
c10::cuda::set_device(new_device);
46+
47+
// CHECK: c10::DeviceIndex exchanged_device = c10::xpu::exchange_device(0);
48+
c10::DeviceIndex exchanged_device = c10::cuda::ExchangeDevice(0);
49+
50+
// CHECK: c10::DeviceIndex maybe_exchanged_device = c10::xpu::maybe_exchange_device(1);
51+
c10::DeviceIndex maybe_exchanged_device = c10::cuda::MaybeExchangeDevice(1);
52+
53+
std::optional<c10::Device> device;
2254
try {
2355
// CHECK: c10::OptionalDeviceGuard device_guard(device);
2456
c10::cuda::OptionalCUDAGuard device_guard(device);
@@ -27,18 +59,19 @@ int main() {
2759
return -1;
2860
}
2961

62+
// stream APIs
3063
// CHECK: auto currentStream = c10::xpu::getCurrentXPUStream();
3164
auto currentStream = c10::cuda::getCurrentCUDAStream();
3265

33-
// CHECK: dpct::queue_ptr curr_cuda_st = &(currentStream.queue());
34-
// CHECK-NEXT: curr_cuda_st = &(c10::xpu::getCurrentXPUStream().queue());
66+
// CHECK: dpct::queue_ptr curr_cuda_st =
67+
// CHECK-NEXT: &static_cast<sycl::queue &>(currentStream.queue());
3568
cudaStream_t curr_cuda_st = currentStream.stream();
69+
// CHECK: curr_cuda_st =
70+
// CHECK-NEXT: &static_cast<sycl::queue &>(c10::xpu::getCurrentXPUStream().queue());
3671
curr_cuda_st = c10::cuda::getCurrentCUDAStream().stream();
3772

3873
// CHECK: auto deviceStream = c10::xpu::getCurrentXPUStream(0);
3974
auto deviceStream = c10::cuda::getCurrentCUDAStream(0);
4075

4176
return 0;
4277
}
43-
44-
#endif

0 commit comments

Comments (0)