AliceO2Group
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
Lines changed: 3 additions & 10 deletions
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Lines changed: 27 additions & 21 deletions
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Lines changed: 7 additions & 43 deletions
@@ -140,6 +140,9 @@ void OrtModel::initEnvironment()
 
 void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize)
 {
+  if (mAllocateDeviceMemory) {
+    memoryOnDevice(mDeviceId);
+  }
   mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX");
   mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1");
 
@@ -354,11 +357,6 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float*, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float*, int64_t, float*);
-template void OrtModel::inference<int8_t, int8_t>(int8_t*, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, float>(int8_t*, int64_t, float*);
-template void OrtModel::inference<float, int8_t>(float*, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t*, int64_t, OrtDataType::Float16_t*);
-template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t*, int64_t, int8_t*);
 
 template <class I, class O>
 void OrtModel::inference(I** input, int64_t input_size, O* output)
@@ -419,11 +417,6 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t**, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float**, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float**, int64_t, float*);
-template void OrtModel::inference<int8_t, int8_t>(int8_t**, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, float>(int8_t**, int64_t, float*);
-template void OrtModel::inference<float, int8_t>(float**, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t**, int64_t, OrtDataType::Float16_t*);
-template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t**, int64_t, int8_t*);
 
 template <class I, class O>
 std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& inputs)
 
@@ -631,34 +631,46 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
     }                                                 \
   }
 
-void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
+// Appends the GPU execution provider(s) to sessionOptions, each configured to use mInternals->Streams[stream].
+// The device id reported by cudaGetDevice() is written to *deviceId.
+// CUDA builds register TensorRT (only with ORT_TENSORRT_BUILD) and then CUDA; ROCm builds register the ROCM provider.
+void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId)
 {
   GPUChkErr(cudaGetDevice(deviceId));
+
 #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD)
   const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
-  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
-  ORTCHK(api->CreateCUDAProviderOptions(&cuda_options));
 
-  // std::vector<const char*> keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"};
-  // std::vector<const char*> values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"};
-  // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
+#ifdef ORT_TENSORRT_BUILD
+  OrtTensorRTProviderOptionsV2* trtOptions = nullptr;
+  ORTCHK(api->CreateTensorRTProviderOptions(&trtOptions));
+
+  const std::string device = std::to_string(*deviceId);
+  // NOTE(review): "trt_int8_enable" is set to "1" unconditionally - confirm this is intended for every model loaded through this path.
+  const char* keys[] = {"device_id", "trt_int8_enable"};
+  const char* values[] = {device.c_str(), "1"};
 
-  // this implicitly sets "has_user_compute_stream"
-  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]));
-  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
+  ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions, keys, values, sizeof(keys) / sizeof(keys[0])));
+  ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions, "user_compute_stream", mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions, trtOptions)); // Register TensorRT first: it consequently has higher priority.
+  api->ReleaseTensorRTProviderOptions(trtOptions);
+#endif // ORT_TENSORRT_BUILD
+
+  // CUDA is the fallback for nodes unsupported by TensorRT.
+  OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
+  ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions));
+  // this implicitly sets "has_user_compute_stream"
+  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions));
+  api->ReleaseCUDAProviderOptions(cudaOptions);
 
-  // Finally, don't forget to release the provider options
-  api->ReleaseCUDAProviderOptions(cuda_options);
 #elif defined(ORT_ROCM_BUILD)
-  // const auto& api = Ort::GetApi();
-  // api.GetCurrentGpuDeviceId(deviceId);
-  OrtROCMProviderOptions rocm_options;
-  rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
-  rocm_options.arena_extend_strategy = 0;   // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
-  // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
-  rocm_options.user_compute_stream = mInternals->Streams[stream];
-  session_options.AppendExecutionProvider_ROCM(rocm_options);
+  OrtROCMProviderOptions rocmOptions;
+  rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream
+  rocmOptions.arena_extend_strategy = 0;   // kNextPowerOfTwo = 0, kSameAsRequested = 1
+  rocmOptions.user_compute_stream = mInternals->Streams[stream];
+  sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);
 #endif // ORT_ROCM_BUILD
 }
 
 #ifndef __HIPCC__ // CUDA
 
@@ -1270,26 +1270,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_8);
-                  }
-                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
@@ -1302,26 +1290,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else { // every other output dtype writes to the _16 output buffer
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
                 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else { // every other output dtype writes to the _16 output buffer
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_8);
-                  }
-                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
@@ -1330,26 +1306,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   if (clustererNNShadow.mNnInferenceInputDType == 0) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    } else {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    } else {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_8);
-                    }
-                  } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                    if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   }
                   if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }