Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Common/ML/src/OrtInterface.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ void OrtModel::initEnvironment()

void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize)
{
if (mAllocateDeviceMemory) {
memoryOnDevice(mDeviceId);
}
mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX");
mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1");

Expand Down
48 changes: 27 additions & 21 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Original file line number Diff line number Diff line change
Expand Up @@ -631,34 +631,40 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
} \
}

void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
{
void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) {
GPUChkErr(cudaGetDevice(deviceId));

#if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD)
const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
ORTCHK(api->CreateCUDAProviderOptions(&cuda_options));

// std::vector<const char*> keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"};
// std::vector<const char*> values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"};
// UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
#ifdef ORT_TENSORRT_BUILD
OrtTensorRTProviderOptionsV2* trtOptions = nullptr;
ORTCHK(api->CreateTensorRTProviderOptions(&trtOptions));

const std::string device = std::to_string(*deviceId);
const char* keys[] = {"device_id", "trt_int8_enable"};
const char* values[] = {device.c_str(), "1"};

// this implicitly sets "has_user_compute_stream"
ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]));
ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions,keys,values,sizeof(keys) / sizeof(keys[0])));
ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions,"user_compute_stream",mInternals->Streams[stream]));
ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions,trtOptions)); // Register TensorRT first: it consequently has higher priority.
api->ReleaseTensorRTProviderOptions(trtOptions);
#endif

// CUDA is the fallback for nodes unsupported by TensorRT.
OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions));
ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions,"user_compute_stream",mInternals->Streams[stream]));
ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions,cudaOptions));
api->ReleaseCUDAProviderOptions(cudaOptions);

// Finally, don't forget to release the provider options
api->ReleaseCUDAProviderOptions(cuda_options);
#elif defined(ORT_ROCM_BUILD)
// const auto& api = Ort::GetApi();
// api.GetCurrentGpuDeviceId(deviceId);
OrtROCMProviderOptions rocm_options;
rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
// rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
rocm_options.user_compute_stream = mInternals->Streams[stream];
session_options.AppendExecutionProvider_ROCM(rocm_options);
#endif // ORT_ROCM_BUILD
OrtROCMProviderOptions rocmOptions;
rocmOptions.has_user_compute_stream = 1;
rocmOptions.arena_extend_strategy = 0;
rocmOptions.user_compute_stream = mInternals->Streams[stream];
sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);
#endif
}

#ifndef __HIPCC__ // CUDA
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishe
AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb")
AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].")
AddOption(nnCCDBExtraMetadata, std::string, "", "", 0, "Extra metadata to distinguish between networks, e.g. for different internal datatypes, etc.")
AddHelp("help", 'h')
EndConfig()

Expand Down
38 changes: 19 additions & 19 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1269,15 +1269,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
} else {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
}
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
} else {
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
}
}
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
Expand All @@ -1289,31 +1289,31 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
} else {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
}
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
} else {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
} else {
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
}
}
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
if (clustererNNShadow.mNnInferenceInputDType == 0) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
} else {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
}
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
} else {
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
}
}
if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
Expand Down
Loading
Loading