Skip to content

Commit 8d25116

Browse files
kpoosa authored and rodrigovivi committed
drm/xe/hwmon: Expose GPU PCIe temperature
Expose GPU PCIe average temperature and its limits via hwmon sysfs entry temp5_xxx. Update Xe hwmon sysfs documentation for this.

v2: Update kernel version in Xe hwmon documentation. (Raag)

v3:
- Address review comments from Raag.
- Remove redundant debug log.
- Update kernel version in Xe hwmon documentation. (Raag)

v4:
- Address review comments from Raag.
- Group new temperature attributes with existing temperature attributes as per channel index in Xe hwmon documentation.
- Use TEMP_MASK instead of TEMP_MASK_MAILBOX.
- Add PCIE_SENSOR_MASK which uses REG_FIELD_GET as replacement of PCIE_SENSOR_SHIFT.

v5:
- Address review comments from Raag.
- Use REG_FIELD_GET to get PCIe temperature.
- Move PCIE_SENSOR_GROUP_ID and PCIE_SENSOR_MASK to xe_pcode_api.h
- Cosmetic change.

Signed-off-by: Karthik Poosa <karthik.poosa@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://patch.msgid.link/20260112203521.1014388-4-karthik.poosa@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
1 parent 3a0cb88 commit 8d25116

3 files changed

Lines changed: 58 additions & 0 deletions

File tree

Documentation/ABI/testing/sysfs-driver-intel-xe-hwmon

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -189,6 +189,30 @@ Description: RO. Memory controller average temperature in millidegree Celsius.
189189

190190
Only supported for particular Intel Xe graphics platforms.
191191

192+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_crit
193+
Date: January 2026
194+
KernelVersion: 7.0
195+
Contact: intel-xe@lists.freedesktop.org
196+
Description: RO. GPU PCIe critical temperature in millidegree Celsius.
197+
198+
Only supported for particular Intel Xe graphics platforms.
199+
200+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_emergency
201+
Date: January 2026
202+
KernelVersion: 7.0
203+
Contact: intel-xe@lists.freedesktop.org
204+
Description: RO. GPU PCIe shutdown temperature in millidegree Celsius.
205+
206+
Only supported for particular Intel Xe graphics platforms.
207+
208+
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/temp5_input
209+
Date: January 2026
210+
KernelVersion: 7.0
211+
Contact: intel-xe@lists.freedesktop.org
212+
Description: RO. GPU PCIe temperature in millidegree Celsius.
213+
214+
Only supported for particular Intel Xe graphics platforms.
215+
192216
What: /sys/bus/pci/drivers/xe/.../hwmon/hwmon<i>/fan1_input
193217
Date: March 2025
194218
KernelVersion: 6.16

drivers/gpu/drm/xe/xe_hwmon.c

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@ enum xe_hwmon_channel {
4444
CHANNEL_PKG,
4545
CHANNEL_VRAM,
4646
CHANNEL_MCTRL,
47+
CHANNEL_PCIE,
4748
CHANNEL_MAX,
4849
};
4950

@@ -712,6 +713,7 @@ static const struct hwmon_channel_info * const hwmon_info[] = {
712713
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL |
713714
HWMON_T_MAX,
714715
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
716+
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL,
715717
HWMON_T_CRIT | HWMON_T_EMERGENCY | HWMON_T_INPUT | HWMON_T_LABEL),
716718
HWMON_CHANNEL_INFO(power, HWMON_P_MAX | HWMON_P_RATED_MAX | HWMON_P_LABEL | HWMON_P_CRIT |
717719
HWMON_P_CAP,
@@ -771,6 +773,27 @@ static int get_mc_temp(struct xe_hwmon *hwmon, long *val)
771773
return 0;
772774
}
773775

776+
/*
 * get_pcie_temp - read the GPU PCIe temperature through the pcode mailbox.
 *
 * Sends a PCODE_THERMAL_INFO / READ_THERMAL_DATA request for
 * PCIE_SENSOR_GROUP_ID on the root tile, extracts the TEMP_MASK field from the
 * returned 32-bit value and stores it in *val multiplied by
 * MILLIDEGREE_PER_DEGREE. The field is cast to s8 before scaling, so a raw
 * value with bit 7 set is reported as a negative temperature.
 *
 * Returns 0 on success, or the non-zero value returned by xe_pcode_read(), in
 * which case *val is left untouched.
 */
static int get_pcie_temp(struct xe_hwmon *hwmon, long *val)
777+
{
778+
struct xe_tile *root_tile = xe_device_get_root_tile(hwmon->xe);
779+
u32 data = 0;
780+
int ret;
781+
782+
ret = xe_pcode_read(root_tile, PCODE_MBOX(PCODE_THERMAL_INFO, READ_THERMAL_DATA,
783+
PCIE_SENSOR_GROUP_ID), &data, NULL);
784+
if (ret)
785+
return ret;
786+
787+
/* Sensor offset is different for G21: every other subplatform first selects the PCIE_SENSOR_MASK field */
788+
if (hwmon->xe->info.subplatform != XE_SUBPLATFORM_BATTLEMAGE_G21)
789+
data = REG_FIELD_GET(PCIE_SENSOR_MASK, data);
790+
791+
data = REG_FIELD_GET(TEMP_MASK, data);
792+
*val = (s8)data * MILLIDEGREE_PER_DEGREE;
793+
794+
return 0;
795+
}
796+
774797
/* I1 is exposed as power_crit or as curr_crit depending on bit 31 */
775798
static int xe_hwmon_pcode_read_i1(const struct xe_hwmon *hwmon, u32 *uval)
776799
{
@@ -876,6 +899,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
876899
case CHANNEL_VRAM:
877900
return hwmon->temp.limit[TEMP_LIMIT_MEM_SHUTDOWN] ? 0444 : 0;
878901
case CHANNEL_MCTRL:
902+
case CHANNEL_PCIE:
879903
return hwmon->temp.count ? 0444 : 0;
880904
default:
881905
return 0;
@@ -887,6 +911,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
887911
case CHANNEL_VRAM:
888912
return hwmon->temp.limit[TEMP_LIMIT_MEM_CRIT] ? 0444 : 0;
889913
case CHANNEL_MCTRL:
914+
case CHANNEL_PCIE:
890915
return hwmon->temp.count ? 0444 : 0;
891916
default:
892917
return 0;
@@ -906,6 +931,7 @@ xe_hwmon_temp_is_visible(struct xe_hwmon *hwmon, u32 attr, int channel)
906931
return xe_reg_is_valid(xe_hwmon_get_reg(hwmon, REG_TEMP,
907932
channel)) ? 0444 : 0;
908933
case CHANNEL_MCTRL:
934+
case CHANNEL_PCIE:
909935
return hwmon->temp.count ? 0444 : 0;
910936
default:
911937
return 0;
@@ -933,13 +959,16 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
933959
return 0;
934960
case CHANNEL_MCTRL:
935961
return get_mc_temp(hwmon, val);
962+
case CHANNEL_PCIE:
963+
return get_pcie_temp(hwmon, val);
936964
default:
937965
return -EOPNOTSUPP;
938966
}
939967
case hwmon_temp_emergency:
940968
switch (channel) {
941969
case CHANNEL_PKG:
942970
case CHANNEL_MCTRL:
971+
case CHANNEL_PCIE:
943972
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_SHUTDOWN] * MILLIDEGREE_PER_DEGREE;
944973
return 0;
945974
case CHANNEL_VRAM:
@@ -952,6 +981,7 @@ xe_hwmon_temp_read(struct xe_hwmon *hwmon, u32 attr, int channel, long *val)
952981
switch (channel) {
953982
case CHANNEL_PKG:
954983
case CHANNEL_MCTRL:
984+
case CHANNEL_PCIE:
955985
*val = hwmon->temp.limit[TEMP_LIMIT_PKG_CRIT] * MILLIDEGREE_PER_DEGREE;
956986
return 0;
957987
case CHANNEL_VRAM:
@@ -1331,6 +1361,8 @@ static int xe_hwmon_read_label(struct device *dev,
13311361
*str = "vram";
13321362
else if (channel == CHANNEL_MCTRL)
13331363
*str = "mctrl";
1364+
else if (channel == CHANNEL_PCIE)
1365+
*str = "pcie";
13341366
return 0;
13351367
case hwmon_power:
13361368
case hwmon_energy:

drivers/gpu/drm/xe/xe_pcode_api.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,8 @@
5454
#define READ_THERMAL_LIMITS 0x0
5555
#define READ_THERMAL_CONFIG 0x1
5656
#define READ_THERMAL_DATA 0x2
57+
#define PCIE_SENSOR_GROUP_ID 0x2
58+
#define PCIE_SENSOR_MASK REG_GENMASK(31, 16)
5759

5860
#define PCODE_LATE_BINDING 0x5C
5961
#define GET_CAPABILITY_STATUS 0x0

0 commit comments

Comments
 (0)