From e7e21b4d3ab49f6f7bbf2863ec81c61378e0143e Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Tue, 30 Jun 2026 09:28:13 +0200
Subject: [PATCH 1/8] Adding support for int8 inputs

---
 Common/ML/src/OrtInterface.cxx                |  10 ++
 GPU/GPUTracking/Definitions/GPUSettingsList.h |   1 +
 .../Global/GPUChainTrackingClusterizer.cxx    |  60 ++++++--
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  |  86 +++++++----
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |   6 +
 .../GPUTPCNNClusterizerHost.cxx               |  21 ++-
 .../GPUTPCNNClusterizerKernels.cxx            | 135 +++++++++++++-----
 GPU/Workflow/src/GPUWorkflowSpec.cxx          |   1 +
 8 files changed, 242 insertions(+), 78 deletions(-)
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 8f88ab18dacbd..7e3404a288447 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -354,6 +354,11 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float*, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float*, int64_t, float*);
+template void OrtModel::inference<int8_t, int8_t>(int8_t*, int64_t, int8_t*);
+template void OrtModel::inference<int8_t, float>(int8_t*, int64_t, float*);
+template void OrtModel::inference<float, int8_t>(float*, int64_t, int8_t*);
+template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t*, int64_t, OrtDataType::Float16_t*);
+template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t*, int64_t, int8_t*);
 
 template <class I, class O>
 void OrtModel::inference(I** input, int64_t input_size, O* output)
@@ -414,6 +419,11 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t**, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float**, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float**, int64_t, float*);
+template void OrtModel::inference<int8_t, int8_t>(int8_t**, int64_t, int8_t*);
+template void OrtModel::inference<int8_t, float>(int8_t**, int64_t, float*);
+template void OrtModel::inference<float, int8_t>(float**, int64_t, int8_t*);
+template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t**, int64_t, OrtDataType::Float16_t*);
+template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t**, int64_t, int8_t*);
 
 template <class I, class O>
 std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& inputs)
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index e3327f2f8b661..40f7a34e699c9 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -300,6 +300,7 @@ AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishe
 AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
 AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb")
 AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].")
+AddOption(nnCCDBExtraMetadata, std::string, "", "", 0, "Extra metadata to distinguish between networks, e.g. for different internal datatypes, etc.")
 AddHelp("help", 'h')
 EndConfig()
 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 76dcd54a89289..76f4c6479287e 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1269,15 +1269,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
                   } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
                   } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_8);
+                  }
+                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
+                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_32);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
@@ -1289,15 +1301,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
                   } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
                 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
                   } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_8);
+                  }
+                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
+                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_32);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_16);
+                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
@@ -1305,15 +1329,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
                   if (clustererNNShadow.mNnInferenceInputDType == 0) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
                     } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
+                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
                     } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
+                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_8);
+                    }
+                  } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
+                    if (clustererNNShadow.mNnInferenceOutputDType == 0) {
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_32);
+                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_16);
+                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
+                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   }
                   if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 6fac0e417ac26..3decd231aec62 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -30,13 +30,27 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
   void* startMem = mem;
   if (mNnClusterizerBatchedMode > 0) {
     if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
-      computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
-    } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
       computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
+    } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
+    } else if (mNnInferenceInputDType == 2 && mNnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, mInputData_8, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
     }
     computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode);
 
     if (mNnInferenceOutputDType == 0 && mNnClusterizerElementSize > 0) {
+      if (mNnClusterizerModelClassNumOutputNodes > 0) {
+        computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes);
+      }
+      if (!mNnClusterizerUseCfRegression) {
+        if (mNnClusterizerModelReg1NumOutputNodes > 0) {
+          computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes);
+        }
+        if (mNnClusterizerModelReg2NumOutputNodes > 0) {
+          computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
+        }
+      }
+    } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) {
       if (mNnClusterizerModelClassNumOutputNodes > 0) {
         computePointerWithAlignment(mem, mModelProbabilities_16, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes);
       }
@@ -48,16 +62,16 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
           computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
         }
       }
-    } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) {
+    } else if (mNnInferenceOutputDType == 2 && mNnClusterizerElementSize > 0) {
       if (mNnClusterizerModelClassNumOutputNodes > 0) {
-        computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes);
+        computePointerWithAlignment(mem, mModelProbabilities_8, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes);
       }
       if (!mNnClusterizerUseCfRegression) {
         if (mNnClusterizerModelReg1NumOutputNodes > 0) {
-          computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes);
+          computePointerWithAlignment(mem, mOutputDataReg1_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes);
         }
         if (mNnClusterizerModelReg2NumOutputNodes > 0) {
-          computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
+          computePointerWithAlignment(mem, mOutputDataReg2_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
         }
       }
     }
@@ -78,26 +92,34 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
 
       // Element counts (number of array entries, not bytes)
       size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0;
-      size_t elemsInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
-      size_t elemsInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
-      size_t elemsProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
-      size_t elemsProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
-      size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
-      size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
+      size_t elemsInput_32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
+      size_t elemsInput_16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
+      size_t elemsInput_8 = (mInputData_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
+      size_t elemsProb_32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
+      size_t elemsProb_16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
+      size_t elemsProb_8 = (mModelProbabilities_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
       size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
+      size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
+      size_t elemsReg1_8 = (mOutputDataReg1_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
       size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
+      size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
+      size_t elemsReg2_8 = (mOutputDataReg2_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
       size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0;
 
       // Byte sizes
       size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t);
-      size_t szInput16 = elemsInput16 * sizeof(OrtDataType::Float16_t);
-      size_t szInput32 = elemsInput32 * sizeof(float);
-      size_t szProb16 = elemsProb16 * sizeof(OrtDataType::Float16_t);
-      size_t szProb32 = elemsProb32 * sizeof(float);
-      size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t);
-      size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t);
+      size_t szInput_32 = elemsInput_32 * sizeof(float);
+      size_t szInput_16 = elemsInput_16 * sizeof(OrtDataType::Float16_t);
+      size_t szInput_8 = elemsInput_8 * sizeof(int8_t);
+      size_t szProb_32 = elemsProb_32 * sizeof(float);
+      size_t szProb_16 = elemsProb_16 * sizeof(OrtDataType::Float16_t);
+      size_t szProb_8 = elemsProb_8 * sizeof(int8_t);
       size_t szReg1_32 = elemsReg1_32 * sizeof(float);
+      size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t);
+      size_t szReg1_8 = elemsReg1_8 * sizeof(int8_t);
       size_t szReg2_32 = elemsReg2_32 * sizeof(float);
+      size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t);
+      size_t szReg2_8 = elemsReg2_8 * sizeof(int8_t);
       size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t);
 
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
@@ -107,12 +129,24 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast<const void*>(mClusterFlags)
                 << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
                 << " | " << fmt(szClusterFlags);
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_8 pointer: " << mInputData_8
+                << " | elements=" << elemsInput_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput_8);
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_8 pointer: " << mModelProbabilities_8
+                << " | elements=" << elemsProb_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb_8);
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_8 pointer: " << mOutputDataReg1_8
+                << " | elements=" << elemsReg1_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
+                << " | " << fmt(szReg1_8);
+      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_8 pointer: " << mOutputDataReg2_8
+                << " | elements=" << elemsReg2_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
+                << " | " << fmt(szReg2_8);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16
-                << " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
-                << " | " << fmt(szInput16);
+                << " | elements=" << elemsInput_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16
-                << " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
-                << " | " << fmt(szProb16);
+                << " | elements=" << elemsProb_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16
                 << " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
                 << " | " << fmt(szReg1_16);
@@ -120,11 +154,11 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
                 << " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
                 << " | " << fmt(szReg2_16);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32
-                << " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
-                << " | " << fmt(szInput32);
+                << " | elements=" << elemsInput_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
+                << " | " << fmt(szInput_32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32
-                << " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
-                << " | " << fmt(szProb32);
+                << " | elements=" << elemsProb_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
+                << " | " << fmt(szProb_32);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32
                 << " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
                 << " | " << fmt(szReg1_32);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index b7bc1575d349a..9ef1a658a9c3a 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -101,6 +101,12 @@ class GPUTPCNNClusterizer : public GPUProcessor
   OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr;
   OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr;
 
+  // INT8
+  int8_t* mInputData_8 = nullptr;
+  int8_t* mModelProbabilities_8 = nullptr;
+  int8_t* mOutputDataReg1_8 = nullptr;
+  int8_t* mOutputDataReg2_8 = nullptr;
+
   int16_t mMemoryId = -1;
 }; // class GPUTPCNNClusterizer
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index 77d5ee13f85fb..f4f52d7ec3298 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -123,8 +123,25 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   } else {
     clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity;
   }
-  clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos;
-  clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos;
+  // Define the datatype for input and output
+  if (settings.nnInferenceInputDType.find("32") != std::string::npos) {
+    clustererNN.mNnInferenceInputDType = 0;
+  } else if (settings.nnInferenceInputDType.find("16") != std::string::npos) {
+    clustererNN.mNnInferenceInputDType = 1;
+  } else if (settings.nnInferenceInputDType.find("8") != std::string::npos) {
+    clustererNN.mNnInferenceInputDType = 2;
+  } else {
+    clustererNN.mNnInferenceInputDType = 1; // Default to float32
+  }
+  if (settings.nnInferenceOutputDType.find("32") != std::string::npos) {
+    clustererNN.mNnInferenceOutputDType = 0;
+  } else if (settings.nnInferenceOutputDType.find("16") != std::string::npos) {
+    clustererNN.mNnInferenceOutputDType = 1;
+  } else if (settings.nnInferenceOutputDType.find("8") != std::string::npos) {
+    clustererNN.mNnInferenceOutputDType = 2;
+  } else {
+    clustererNN.mNnInferenceOutputDType = 1; // Default to float32
+  }
   clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1];
   if (!settings.nnClusterizerUseCfRegression) {
     if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index ee0fa217b8095..a33a9eaed58ef 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -92,17 +92,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
           // Fill boundary value
           output_value = clustererNN.mNnClusterizerBoundaryFillValue;
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-          } else {
             clustererNN.mInputData_32[write_idx] = output_value;
+          } else if (dtype == 1) {
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+          } else if (dtype == 2) {
+            clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
           }
         } else {
           CfChargePos tmp_pos(target_row, target_pad, target_time);
           output_value = chargeMap[tmp_pos].unpack() * inverse_charge;
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-          } else {
             clustererNN.mInputData_32[write_idx] = output_value;
+          } else if (dtype == 1) {
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+          } else if (dtype == 2) {
+            clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
           }
         }
         // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -116,13 +120,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 
   if (clustererNN.mNnClusterizerAddIndexData) {
     if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
-      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
-    } else {
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
       clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
+    } else if (dtype == 1) {
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
+    } else if (dtype == 2) {
+      clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+      clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+      clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast<float>(pad) / npads_row);
     }
   }
 
@@ -142,6 +150,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
+  // Statically quantized S8S8 ONNX models with graph-contained scaling expose
+  // FP32 graph boundaries and must use dtype == 0 here. Their QuantizeLinear
+  // nodes execute on the ONNX GPU stream. dtype == 2 is reserved for models
+  // with true external INT8 I/O and requires matching external scale metadata.
   const uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
@@ -173,13 +185,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
     uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
     const int32_t npads = GPUTPCGeometry::NPads(row);
     if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
-      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
-    } else {
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
       clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
+    } else if (dtype == 1) {
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
+    } else if (dtype == 2) {
+      clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+      clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+      clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast<float>(pad) / npads);
     }
   }
 
@@ -197,9 +213,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
     if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) {
       for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
         if (dtype == 0) {
+          clustererNN.mInputData_32[write_idx] = output_value;
+        } else if (dtype == 1) {
           clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         } else {
-          clustererNN.mInputData_32[write_idx] = output_value;
+          clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
         }
         write_idx += clustererNN.mNnClusterizerFullTimeSize;
       }
@@ -224,9 +242,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
           output_value = chargeMap[pos].unpack() * inverse_central_charge;
         }
         if (dtype == 0) {
+          clustererNN.mInputData_32[write_idx] = output_value;
+        } else if (dtype == 1) {
           clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         } else {
-          clustererNN.mInputData_32[write_idx] = output_value;
+          clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
         }
         write_idx += clustererNN.mNnClusterizerFullTimeSize;
       }
@@ -246,9 +266,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
   }
   if (clustererNN.mNnClusterizerUseClassification) {
     if (dtype == 0) {
-      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
-    } else if (dtype == 1) {
       clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold);
+    } else if (dtype == 1) {
+      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
+    } else if (dtype == 2) {
+      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_8[glo_idx] > clustererNN.mNnClassThreshold);
     }
   } else {
     clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
@@ -271,15 +293,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
     for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
       if (pIdx == elem_iterator) {
         if (dtype == 0) {
-          current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
-        } else if (dtype == 1) {
           current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
+        } else if (dtype == 1) {
+          current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
+        } else if (dtype == 2) {
+          current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_8[pIdx]);
         }
       } else {
         if (dtype == 0) {
-          current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
-        } else if (dtype == 1) {
           current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
+        } else if (dtype == 1) {
+          current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
+        } else if (dtype == 2) {
+          current_max_prob = CAMath::Max(current_max_prob, static_cast<float>(clustererNN.mModelProbabilities_8[pIdx]));
         }
       }
     }
@@ -368,6 +394,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
   float publishPadPosition = 0.f, publishTimePosition = 0.f;
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
+    pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
+               publishPadPosition,
+               notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
+               notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
+  } else if (dtype == 1) {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -378,15 +415,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
+  } else if (dtype == 2) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_8[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_8[model_output_index + 1];
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
+    pc.setFull(central_charge * clustererNN.mOutputDataReg1_8[model_output_index + 4],
                publishPadPosition,
-               notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
+               notSinglePad ? clustererNN.mOutputDataReg1_8[model_output_index + 2] : 0.f,
                (clusterer.mPmemory->fragment).start + publishTimePosition,
-               notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
+               notSingleTime ? clustererNN.mOutputDataReg1_8[model_output_index + 3] : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
@@ -554,6 +591,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   // Cluster 1
   float publishPadPosition = 0.f, publishTimePosition = 0.f;
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
+    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
+               publishPadPosition,
+               clustererNN.mOutputDataReg2_32[model_output_index + 4],
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
+               clustererNN.mOutputDataReg2_32[model_output_index + 6],
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
+  } else if (dtype == 1) {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -564,15 +612,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
+  } else if (dtype == 2) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 1];
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
+    pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 8],
                publishPadPosition,
-               clustererNN.mOutputDataReg2_32[model_output_index + 4],
+               clustererNN.mOutputDataReg2_8[model_output_index + 4],
                (clusterer.mPmemory->fragment).start + publishTimePosition,
-               clustererNN.mOutputDataReg2_32[model_output_index + 6],
+               clustererNN.mOutputDataReg2_8[model_output_index + 6],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
@@ -608,6 +656,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
   // Cluster 2
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
+    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
+               publishPadPosition,
+               clustererNN.mOutputDataReg2_32[model_output_index + 5],
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
+               clustererNN.mOutputDataReg2_32[model_output_index + 7],
+               clustererNN.mClusterFlags[2 * glo_idx],
+               clustererNN.mClusterFlags[2 * glo_idx + 1]);
+  } else if (dtype == 1) {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -618,15 +677,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
+  } else if (dtype == 2) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index + 1];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 3];
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
+    pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 9],
                publishPadPosition,
-               clustererNN.mOutputDataReg2_32[model_output_index + 5],
+               clustererNN.mOutputDataReg2_8[model_output_index + 5],
                (clusterer.mPmemory->fragment).start + publishTimePosition,
-               clustererNN.mOutputDataReg2_32[model_output_index + 7],
+               clustererNN.mOutputDataReg2_8[model_output_index + 7],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx
index e54d1dadb6ce2..d928cfdd502c9 100644
--- a/GPU/Workflow/src/GPUWorkflowSpec.cxx
+++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx
@@ -1263,6 +1263,7 @@ Inputs GPURecoWorkflowSpec::inputs()
     metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN
     metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate;   // in kHz
     metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType;                 // pp, pPb, PbPb
+    metadata["nnCCDBExtraMetadata"] = nnClusterizerSettings.nnCCDBExtraMetadata;       // Extra metadata for CCDB
 
     auto convert_map_to_metadata = [](const std::map<std::string, std::string>& inputMap, std::vector<o2::framework::CCDBMetadata>& outputMetadata) {
       for (const auto& [key, value] : inputMap) {

From f09b710dcffc1b94d09dac64b2894b4214059ba6 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Tue, 30 Jun 2026 13:10:33 +0200
Subject: [PATCH 2/8] Reverting int8 changes as int8 computations are taken
 care of in the ONNX graph due to requirement of additional scaling

---
 Common/ML/src/OrtInterface.cxx                | 13 +--
 .../Base/cuda/GPUReconstructionCUDA.cu        | 48 ++++++-----
 .../Global/GPUChainTrackingClusterizer.cxx    | 50 ++----------
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx  | 34 --------
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  8 +-
 .../GPUTPCNNClusterizerKernels.cxx            | 79 +++----------------
 6 files changed, 50 insertions(+), 182 deletions(-)

diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 7e3404a288447..9eccb9638d882 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -140,6 +140,9 @@ void OrtModel::initEnvironment()
 
 void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize)
 {
+  if (mAllocateDeviceMemory) {
+    memoryOnDevice(mDeviceId);
+  }
   mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX");
   mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1");
 
@@ -354,11 +357,6 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float*, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float*, int64_t, float*);
-template void OrtModel::inference<int8_t, int8_t>(int8_t*, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, float>(int8_t*, int64_t, float*);
-template void OrtModel::inference<float, int8_t>(float*, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t*, int64_t, OrtDataType::Float16_t*);
-template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t*, int64_t, int8_t*);
 
 template <class I, class O>
 void OrtModel::inference(I** input, int64_t input_size, O* output)
@@ -419,11 +417,6 @@ template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t**, int64_t, float*);
 template void OrtModel::inference<float, OrtDataType::Float16_t>(float**, int64_t, OrtDataType::Float16_t*);
 template void OrtModel::inference<float, float>(float**, int64_t, float*);
-template void OrtModel::inference<int8_t, int8_t>(int8_t**, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, float>(int8_t**, int64_t, float*);
-template void OrtModel::inference<float, int8_t>(float**, int64_t, int8_t*);
-template void OrtModel::inference<int8_t, OrtDataType::Float16_t>(int8_t**, int64_t, OrtDataType::Float16_t*);
-template void OrtModel::inference<OrtDataType::Float16_t, int8_t>(OrtDataType::Float16_t**, int64_t, int8_t*);
 
 template <class I, class O>
 std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& inputs)
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 040a4b84a0f64..23847fad18c9f 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -631,34 +631,40 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
     }                                                 \
   }
 
-void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
-{
+void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) {
   GPUChkErr(cudaGetDevice(deviceId));
+
 #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD)
   const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
-  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
-  ORTCHK(api->CreateCUDAProviderOptions(&cuda_options));
 
-  // std::vector<const char*> keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"};
-  // std::vector<const char*> values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"};
-  // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
+#ifdef ORT_TENSORRT_BUILD
+  OrtTensorRTProviderOptionsV2* trtOptions = nullptr;
+  ORTCHK(api->CreateTensorRTProviderOptions(&trtOptions));
+
+  const std::string device = std::to_string(*deviceId);
+  const char* keys[] = {"device_id", "trt_int8_enable"};
+  const char* values[] = {device.c_str(), "1"};
 
-  // this implicitly sets "has_user_compute_stream"
-  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]));
-  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options));
+  ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions,keys,values,sizeof(keys) / sizeof(keys[0])));
+  ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions,"user_compute_stream",mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions,trtOptions)); // Register TensorRT first: it consequently has higher priority.
+  api->ReleaseTensorRTProviderOptions(trtOptions);
+#endif
+
+  // CUDA is the fallback for nodes unsupported by TensorRT.
+  OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
+  ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions));
+  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions,"user_compute_stream",mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions,cudaOptions));
+  api->ReleaseCUDAProviderOptions(cudaOptions);
 
-  // Finally, don't forget to release the provider options
-  api->ReleaseCUDAProviderOptions(cuda_options);
 #elif defined(ORT_ROCM_BUILD)
-  // const auto& api = Ort::GetApi();
-  // api.GetCurrentGpuDeviceId(deviceId);
-  OrtROCMProviderOptions rocm_options;
-  rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
-  rocm_options.arena_extend_strategy = 0;   // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
-  // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
-  rocm_options.user_compute_stream = mInternals->Streams[stream];
-  session_options.AppendExecutionProvider_ROCM(rocm_options);
-#endif // ORT_ROCM_BUILD
+  OrtROCMProviderOptions rocmOptions;
+  rocmOptions.has_user_compute_stream = 1;
+  rocmOptions.arena_extend_strategy = 0;
+  rocmOptions.user_compute_stream = mInternals->Streams[stream];
+  sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);
+#endif
 }
 
 #ifndef __HIPCC__ // CUDA
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 76f4c6479287e..462b7798ce337 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1270,26 +1270,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_8);
-                  }
-                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mModelProbabilities_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
@@ -1302,26 +1290,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
-                } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
+                } else {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                  } else {
                     (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_8);
-                  }
-                } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                  if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_32);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_16);
-                  } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                    (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg1_8);
                   }
                 }
                 if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
@@ -1330,26 +1306,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   if (clustererNNShadow.mNnInferenceInputDType == 0) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    } else {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
                     if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+                    } else {
                       (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_8);
-                    }
-                  } else if (clustererNNShadow.mNnInferenceInputDType == 2) {
-                    if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_32);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_16);
-                    } else if (clustererNNShadow.mNnInferenceOutputDType == 2) {
-                      (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_8, iSize, clustererNNShadow.mOutputDataReg2_8);
                     }
                   }
                   if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 3decd231aec62..0e77393be1ce3 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -33,8 +33,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
       computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
     } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
       computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
-    } else if (mNnInferenceInputDType == 2 && mNnClusterizerElementSize > 0) {
-      computePointerWithAlignment(mem, mInputData_8, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
     }
     computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode);
 
@@ -62,18 +60,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
           computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
         }
       }
-    } else if (mNnInferenceOutputDType == 2 && mNnClusterizerElementSize > 0) {
-      if (mNnClusterizerModelClassNumOutputNodes > 0) {
-        computePointerWithAlignment(mem, mModelProbabilities_8, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes);
-      }
-      if (!mNnClusterizerUseCfRegression) {
-        if (mNnClusterizerModelReg1NumOutputNodes > 0) {
-          computePointerWithAlignment(mem, mOutputDataReg1_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes);
-        }
-        if (mNnClusterizerModelReg2NumOutputNodes > 0) {
-          computePointerWithAlignment(mem, mOutputDataReg2_8, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes);
-        }
-      }
     }
   }
   if (mNnClusterizerTotalClusters > 0) {
@@ -94,32 +80,24 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
       size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0;
       size_t elemsInput_32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
       size_t elemsInput_16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
-      size_t elemsInput_8 = (mInputData_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
       size_t elemsProb_32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
       size_t elemsProb_16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
-      size_t elemsProb_8 = (mModelProbabilities_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
       size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
       size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
-      size_t elemsReg1_8 = (mOutputDataReg1_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
       size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
       size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
-      size_t elemsReg2_8 = (mOutputDataReg2_8 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
       size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0;
 
       // Byte sizes
       size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t);
       size_t szInput_32 = elemsInput_32 * sizeof(float);
       size_t szInput_16 = elemsInput_16 * sizeof(OrtDataType::Float16_t);
-      size_t szInput_8 = elemsInput_8 * sizeof(int8_t);
       size_t szProb_32 = elemsProb_32 * sizeof(float);
       size_t szProb_16 = elemsProb_16 * sizeof(OrtDataType::Float16_t);
-      size_t szProb_8 = elemsProb_8 * sizeof(int8_t);
       size_t szReg1_32 = elemsReg1_32 * sizeof(float);
       size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t);
-      size_t szReg1_8 = elemsReg1_8 * sizeof(int8_t);
       size_t szReg2_32 = elemsReg2_32 * sizeof(float);
       size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t);
-      size_t szReg2_8 = elemsReg2_8 * sizeof(int8_t);
       size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t);
 
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
@@ -129,18 +107,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast<const void*>(mClusterFlags)
                 << " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
                 << " | " << fmt(szClusterFlags);
-      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_8 pointer: " << mInputData_8
-                << " | elements=" << elemsInput_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
-                << " | " << fmt(szInput_8);
-      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_8 pointer: " << mModelProbabilities_8
-                << " | elements=" << elemsProb_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
-                << " | " << fmt(szProb_8);
-      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_8 pointer: " << mOutputDataReg1_8
-                << " | elements=" << elemsReg1_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
-                << " | " << fmt(szReg1_8);
-      LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_8 pointer: " << mOutputDataReg2_8
-                << " | elements=" << elemsReg2_8 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
-                << " | " << fmt(szReg2_8);
       LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16
                 << " | elements=" << elemsInput_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
                 << " | " << fmt(szInput_16);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 9ef1a658a9c3a..7aa23489eb2f4 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -89,7 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr
   int32_t* mOutputDataClass = nullptr;
 
-  // FP32
+  // FP32, also used for int8 models
   float* mInputData_32 = nullptr;
   float* mModelProbabilities_32 = nullptr;
   float* mOutputDataReg1_32 = nullptr;
@@ -101,12 +101,6 @@ class GPUTPCNNClusterizer : public GPUProcessor
   OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr;
   OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr;
 
-  // INT8
-  int8_t* mInputData_8 = nullptr;
-  int8_t* mModelProbabilities_8 = nullptr;
-  int8_t* mOutputDataReg1_8 = nullptr;
-  int8_t* mOutputDataReg2_8 = nullptr;
-
   int16_t mMemoryId = -1;
 }; // class GPUTPCNNClusterizer
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index a33a9eaed58ef..882a1e684d746 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -93,20 +93,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
           output_value = clustererNN.mNnClusterizerBoundaryFillValue;
           if (dtype == 0) {
             clustererNN.mInputData_32[write_idx] = output_value;
-          } else if (dtype == 1) {
+          } else {
             clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-          } else if (dtype == 2) {
-            clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
           }
         } else {
           CfChargePos tmp_pos(target_row, target_pad, target_time);
           output_value = chargeMap[tmp_pos].unpack() * inverse_charge;
           if (dtype == 0) {
             clustererNN.mInputData_32[write_idx] = output_value;
-          } else if (dtype == 1) {
+          } else {
             clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-          } else if (dtype == 2) {
-            clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
           }
         }
         // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -123,14 +119,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
       clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
-    } else if (dtype == 1) {
+    } else {
       clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
       clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
       clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
-    } else if (dtype == 2) {
-      clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
-      clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast<float>(pad) / npads_row);
     }
   }
 
@@ -188,14 +180,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
       clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
-    } else if (dtype == 1) {
+    } else {
       clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
       clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
       clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
-    } else if (dtype == 2) {
-      clustererNN.mInputData_8[write_idx] = (int8_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
-      clustererNN.mInputData_8[write_idx + 1] = (int8_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_8[write_idx + 2] = (int8_t)(static_cast<float>(pad) / npads);
     }
   }
 
@@ -214,10 +202,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
         if (dtype == 0) {
           clustererNN.mInputData_32[write_idx] = output_value;
-        } else if (dtype == 1) {
-          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         } else {
-          clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         }
         write_idx += clustererNN.mNnClusterizerFullTimeSize;
       }
@@ -243,10 +229,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
         }
         if (dtype == 0) {
           clustererNN.mInputData_32[write_idx] = output_value;
-        } else if (dtype == 1) {
-          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         } else {
-          clustererNN.mInputData_8[write_idx] = (int8_t)output_value;
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         }
         write_idx += clustererNN.mNnClusterizerFullTimeSize;
       }
@@ -267,10 +251,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
   if (clustererNN.mNnClusterizerUseClassification) {
     if (dtype == 0) {
       clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_32[glo_idx] > clustererNN.mNnClassThreshold);
-    } else if (dtype == 1) {
+    } else {
       clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
-    } else if (dtype == 2) {
-      clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)(clustererNN.mModelProbabilities_8[glo_idx] > clustererNN.mNnClassThreshold);
     }
   } else {
     clustererNN.mOutputDataClass[glo_idx + batchStart] = 1;
@@ -294,18 +276,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
       if (pIdx == elem_iterator) {
         if (dtype == 0) {
           current_max_prob = clustererNN.mModelProbabilities_32[pIdx];
-        } else if (dtype == 1) {
+        } else {
           current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
-        } else if (dtype == 2) {
-          current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_8[pIdx]);
         }
       } else {
         if (dtype == 0) {
           current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]);
-        } else if (dtype == 1) {
+        } else {
           current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat());
-        } else if (dtype == 2) {
-          current_max_prob = CAMath::Max(current_max_prob, static_cast<float>(clustererNN.mModelProbabilities_8[pIdx]));
         }
       }
     }
@@ -404,7 +382,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
+  } else {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -415,17 +393,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 2) {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_8[model_output_index];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_8[model_output_index + 1];
-    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg1_8[model_output_index + 4],
-               publishPadPosition,
-               notSinglePad ? clustererNN.mOutputDataReg1_8[model_output_index + 2] : 0.f,
-               (clusterer.mPmemory->fragment).start + publishTimePosition,
-               notSingleTime ? clustererNN.mOutputDataReg1_8[model_output_index + 3] : 0.f,
-               clustererNN.mClusterFlags[2 * glo_idx],
-               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
 
   // if (boundaryFlag != 0) { // Prints the entire NN input for the given index
@@ -601,7 +568,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_32[model_output_index + 6],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
+  } else {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -612,17 +579,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 2) {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 1];
-    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 8],
-               publishPadPosition,
-               clustererNN.mOutputDataReg2_8[model_output_index + 4],
-               (clusterer.mPmemory->fragment).start + publishTimePosition,
-               clustererNN.mOutputDataReg2_8[model_output_index + 6],
-               clustererNN.mClusterFlags[2 * glo_idx],
-               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
 
   tpc::ClusterNative myCluster;
@@ -666,7 +622,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_32[model_output_index + 7],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
+  } else {
     publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
     publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
     isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
@@ -677,17 +633,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
                clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 2) {
-    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_8[model_output_index + 1];
-    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_8[model_output_index + 3];
-    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
-    pc.setFull(central_charge * clustererNN.mOutputDataReg2_8[model_output_index + 9],
-               publishPadPosition,
-               clustererNN.mOutputDataReg2_8[model_output_index + 5],
-               (clusterer.mPmemory->fragment).start + publishTimePosition,
-               clustererNN.mOutputDataReg2_8[model_output_index + 7],
-               clustererNN.mClusterFlags[2 * glo_idx],
-               clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
 
   rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);

From 039f813e4bd5c287da5fab8fa328e1b189547e08 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Tue, 30 Jun 2026 13:13:01 +0200
Subject: [PATCH 3/8] Bug-fix

---
 .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx     | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index f4f52d7ec3298..96b8a1d7ed2fd 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -126,21 +126,13 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   // Define the datatype for input and output
   if (settings.nnInferenceInputDType.find("32") != std::string::npos) {
     clustererNN.mNnInferenceInputDType = 0;
-  } else if (settings.nnInferenceInputDType.find("16") != std::string::npos) {
-    clustererNN.mNnInferenceInputDType = 1;
-  } else if (settings.nnInferenceInputDType.find("8") != std::string::npos) {
-    clustererNN.mNnInferenceInputDType = 2;
   } else {
-    clustererNN.mNnInferenceInputDType = 1; // Default to float32
+    clustererNN.mNnInferenceInputDType = 1; // Default to float16
   }
   if (settings.nnInferenceOutputDType.find("32") != std::string::npos) {
     clustererNN.mNnInferenceOutputDType = 0;
-  } else if (settings.nnInferenceOutputDType.find("16") != std::string::npos) {
-    clustererNN.mNnInferenceOutputDType = 1;
-  } else if (settings.nnInferenceOutputDType.find("8") != std::string::npos) {
-    clustererNN.mNnInferenceOutputDType = 2;
   } else {
-    clustererNN.mNnInferenceOutputDType = 1; // Default to float32
+    clustererNN.mNnInferenceOutputDType = 1; // Default to float16
   }
   clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1];
   if (!settings.nnClusterizerUseCfRegression) {

From 1023771d2baa3c71ffa8ff768caa8010697b0204 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Tue, 30 Jun 2026 13:01:18 +0000
Subject: [PATCH 4/8] Please consider the following formatting changes

---
 GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 23847fad18c9f..7ad12ef90f253 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -631,7 +631,8 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
     }                                                 \
   }
 
-void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId) {
+void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions, int32_t stream, int32_t* deviceId)
+{
   GPUChkErr(cudaGetDevice(deviceId));
 
 #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD)
@@ -645,17 +646,17 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions
   const char* keys[] = {"device_id", "trt_int8_enable"};
   const char* values[] = {device.c_str(), "1"};
 
-  ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions,keys,values,sizeof(keys) / sizeof(keys[0])));
-  ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions,"user_compute_stream",mInternals->Streams[stream]));
-  ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions,trtOptions)); // Register TensorRT first: it consequently has higher priority.
+  ORTCHK(api->UpdateTensorRTProviderOptions(trtOptions, keys, values, sizeof(keys) / sizeof(keys[0])));
+  ORTCHK(api->UpdateTensorRTProviderOptionsWithValue(trtOptions, "user_compute_stream", mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_TensorRT_V2(sessionOptions, trtOptions)); // Register TensorRT first: it consequently has higher priority.
   api->ReleaseTensorRTProviderOptions(trtOptions);
 #endif
 
   // CUDA is the fallback for nodes unsupported by TensorRT.
   OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
   ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions));
-  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions,"user_compute_stream",mInternals->Streams[stream]));
-  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions,cudaOptions));
+  ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream]));
+  ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions));
   api->ReleaseCUDAProviderOptions(cudaOptions);
 
 #elif defined(ORT_ROCM_BUILD)

From 599cd6718c7629131c874201fed306833bfe293c Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Thu, 2 Jul 2026 10:26:56 +0200
Subject: [PATCH 5/8] Changing memory layout usage

---
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx       | 3 +--
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.h         | 8 ++------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 882a1e684d746..9d114b40997e6 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -17,7 +17,6 @@
 #include "GPUTPCNNClusterizerKernels.h"
 #include "GPUConstantMem.h"
 #include "GPUTPCClusterFinder.h"
-#include "GPUTPCCFClusterizer.h"
 #include "GPUTPCGeometry.h"
 
 using namespace o2::gpu;
@@ -48,7 +47,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
   CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
   tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
   int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
-  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
+  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
 }
 
 template <>
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index c77a99bec3a70..15258dbacb115 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -18,6 +18,7 @@
 #include "CfArray2D.h"
 #include "GPUGeneralKernels.h"
 #include "GPUTPCNNClusterizer.h"
+#include "GPUTPCCFClusterizer.h"
 
 namespace o2::tpc
 {
@@ -37,12 +38,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
  public:
   // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
   static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);
-  struct GPUSharedMemory {
-    // Regular cluster finder
-    CfChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE];
-    PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N];
-    uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE];
-  };
+  using GPUSharedMemory = GPUTPCCFClusterizer::GPUSharedMemory;
 
   GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep()
   {

From 37cc87f4aa9850e02331cefa91c254dcf476cb48 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Thu, 2 Jul 2026 13:02:19 +0200
Subject: [PATCH 6/8] Adjusting launch bounds

---
 GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv        | 2 +-
 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 ++
 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h   | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
index 97761a86b966b..5a8c7cf4ddaec 100644
--- a/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
+++ b/GPU/GPUTracking/Definitions/Parameters/GPUParameters.csv
@@ -76,7 +76,7 @@ GPUMemClean16,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,,
 GPUitoa,"[""GPUCA_THREAD_COUNT_DEFAULT"", 1]",,,,,,,,,,,,,,,
 GPUTPCCFNoiseSuppression_noiseSuppression,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448
 GPUTPCCFNoiseSuppression_updatePeaks,"""GPUCA_LB_GPUTPCCFNoiseSuppression""",,,,,,,,,,,,,,,448
-GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,,
+GPUTPCNNClusterizerKernels_runCfClusterizer,"""GPUCA_LB_GPUTPCCFClusterizer""",,,,,,,,,,,,,,,
 GPUTPCNNClusterizerKernels_fillInputNNCPU,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,,
 GPUTPCNNClusterizerKernels_fillInputNNGPU,1024,,,,,,,,,,,,,,,
 GPUTPCNNClusterizerKernels_determineClass1Labels,"""GPUCA_LB_GPUTPCNNClusterizerKernels""",,,,,,,,,,,,,,,
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 9d114b40997e6..693ee4dd78e8d 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -36,6 +36,8 @@ using namespace o2::gpu::tpccf;
 #include "GPUTPCCFClusterizer.inc"
 #endif
 
+static_assert(GPUTPCNNClusterizerKernels::SCRATCH_PAD_WORK_GROUP_SIZE == GPUTPCCFClusterizer::SCRATCH_PAD_WORK_GROUP_SIZE, "Work group sizes do not match");
+
 // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index 15258dbacb115..d7194d163f341 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -37,7 +37,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 {
  public:
   // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
-  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);
+  static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer);
   using GPUSharedMemory = GPUTPCCFClusterizer::GPUSharedMemory;
 
   GPUhdi() constexpr static gpudatatypes::RecoStep GetRecoStep()

From a9b7adcd94c909040bfa3a3c672c28714fb43832 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Thu, 2 Jul 2026 13:07:34 +0200
Subject: [PATCH 7/8] Adding back some helpful comments

---
 GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 7ad12ef90f253..5c719a50638e6 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -655,14 +655,20 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions
   // CUDA is the fallback for nodes unsupported by TensorRT.
   OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
   ORTCHK(api->CreateCUDAProviderOptions(&cudaOptions));
+  // std::vector<const char*> keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"};
+  // std::vector<const char*> values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"};
+  // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
   ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cudaOptions, "user_compute_stream", mInternals->Streams[stream]));
   ORTCHK(api->SessionOptionsAppendExecutionProvider_CUDA_V2(sessionOptions, cudaOptions));
   api->ReleaseCUDAProviderOptions(cudaOptions);
 
 #elif defined(ORT_ROCM_BUILD)
+  // const auto& api = Ort::GetApi();
+  // api.GetCurrentGpuDeviceId(deviceId);
   OrtROCMProviderOptions rocmOptions;
-  rocmOptions.has_user_compute_stream = 1;
-  rocmOptions.arena_extend_strategy = 0;
+  rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream
+  rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
+  // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
   rocmOptions.user_compute_stream = mInternals->Streams[stream];
   sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);
 #endif

From 9de8fdc40310ba7cae085399a97d7275c2f6d7df Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Thu, 2 Jul 2026 11:11:26 +0000
Subject: [PATCH 8/8] Please consider the following formatting changes

---
 GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
index 5c719a50638e6..8628abb1c0374 100644
--- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
+++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -667,7 +667,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& sessionOptions
   // api.GetCurrentGpuDeviceId(deviceId);
   OrtROCMProviderOptions rocmOptions;
   rocmOptions.has_user_compute_stream = 1; // Indicate that we are passing a user stream
-  rocmOptions.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
+  rocmOptions.arena_extend_strategy = 0;   // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
   // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
   rocmOptions.user_compute_stream = mInternals->Streams[stream];
   sessionOptions.AppendExecutionProvider_ROCM(rocmOptions);