Skip to content

Commit dbc82cd

Browse files
hoshinolina authored and jannau committed
drm/asahi: Handle channel errors
Signed-off-by: Asahi Lina <lina@asahilina.net>
1 parent 864f354 commit dbc82cd

7 files changed

Lines changed: 201 additions & 14 deletions

File tree

drivers/gpu/drm/asahi/channel.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,31 @@ impl EventChannel::ver {
392392
)
393393
}
394394
},
395+
EventMsg::ChannelError {
396+
error_type,
397+
pipe_type,
398+
event_slot,
399+
event_value,
400+
} => match self.gpu.as_ref() {
401+
Some(gpu) => {
402+
let error_type = match error_type {
403+
0 => ChannelErrorType::MemoryError,
404+
1 => ChannelErrorType::DMKill,
405+
2 => ChannelErrorType::Aborted,
406+
3 => ChannelErrorType::Unk3,
407+
a => ChannelErrorType::Unknown(a),
408+
};
409+
gpu.handle_channel_error(
410+
error_type,
411+
pipe_type,
412+
event_slot,
413+
event_value,
414+
);
415+
}
416+
None => {
417+
dev_crit!(self.dev, "EventChannel: No GPU manager available!\n")
418+
}
419+
},
395420
msg => {
396421
dev_crit!(self.dev.as_ref(), "Unknown event message: {:?}\n", msg);
397422
}

drivers/gpu/drm/asahi/debug.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pub(crate) enum DebugFlags {
6666
Debug6 = 54,
6767
Debug7 = 55,
6868

69+
VerboseFaults = 61,
6970
AllowUnknownOverrides = 62,
7071
OopsOnGpuCrash = 63,
7172
}

drivers/gpu/drm/asahi/event.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,16 @@ impl EventManager {
216216
}
217217
}
218218

219+
/// Returns a reference to the workqueue owning an event.
220+
pub(crate) fn get_owner(
221+
&self,
222+
slot: u32,
223+
) -> Option<Arc<dyn workqueue::WorkQueue + Send + Sync>> {
224+
self.alloc
225+
.with_inner(|inner| inner.owners[slot as usize].as_ref().cloned())
226+
.map(|a| a.clone())
227+
}
228+
219229
/// Fail all commands, used when the GPU crashes.
220230
pub(crate) fn fail_all(&self, error: workqueue::WorkError) {
221231
let mut owners: KVec<Arc<dyn workqueue::WorkQueue + Send + Sync>> = KVec::new();

drivers/gpu/drm/asahi/fw/channels.rs

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,16 @@ pub(crate) enum DeviceControlMsg {
184184
halt_count: U64,
185185
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x1c }>,
186186
},
187-
Unk0e(Array<DEVICECONTROL_SZ::ver, u8>),
188-
Unk0f(Array<DEVICECONTROL_SZ::ver, u8>),
187+
RecoverChannel {
188+
pipe_type: u32,
189+
work_queue: GpuWeakPointer<super::workqueue::QueueInfo::ver>,
190+
event_value: u32,
191+
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x10 }>,
192+
},
193+
IdlePowerOff {
194+
val: u32,
195+
__pad: Pad<{ DEVICECONTROL_SZ::ver - 0x4 }>,
196+
},
189197
Unk10(Array<DEVICECONTROL_SZ::ver, u8>),
190198
Unk11(Array<DEVICECONTROL_SZ::ver, u8>),
191199
Unk12(Array<DEVICECONTROL_SZ::ver, u8>),
@@ -236,6 +244,17 @@ pub(crate) struct FwCtlMsg {
236244

237245
pub(crate) const EVENT_SZ: usize = 0x34;
238246

247+
/// Classification of a channel error event.
///
/// The event channel handler decodes the raw `error_type` field of an
/// `EventMsg::ChannelError` message into this enum before passing it on.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
248+
#[repr(C, u32)]
249+
#[allow(dead_code)]
250+
pub(crate) enum ChannelErrorType {
251+
/// Decoded from raw error type 0.
MemoryError,
252+
/// Decoded from raw error type 1.
DMKill,
253+
/// Decoded from raw error type 2.
Aborted,
254+
/// Decoded from raw error type 3; meaning not yet known.
Unk3,
255+
/// Any other raw error type; carries the raw value.
Unknown(u32),
256+
}
257+
239258
#[derive(Debug, Copy, Clone)]
240259
#[repr(C, u32)]
241260
#[allow(dead_code)]
@@ -258,12 +277,19 @@ pub(crate) enum EventMsg {
258277
vm_slot: u32,
259278
buffer_slot: u32,
260279
counter: u32,
261-
}, // Max discriminant: 0x7
280+
},
281+
ChannelError {
282+
error_type: u32,
283+
pipe_type: u32,
284+
event_slot: u32,
285+
event_value: u32,
286+
},
287+
// Max discriminant: 0x8
262288
}
263289

264290
static_assert!(core::mem::size_of::<EventMsg>() == 4 + EVENT_SZ);
265291

266-
pub(crate) const EVENT_MAX: u32 = 0x7;
292+
pub(crate) const EVENT_MAX: u32 = 0x8;
267293

268294
#[derive(Copy, Clone)]
269295
#[repr(C)]

drivers/gpu/drm/asahi/fw/workqueue.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ pub(crate) mod raw {
135135
#[ver(V >= V13_2 && G < G14X)]
136136
pub(crate) unk_84_0: u32,
137137
pub(crate) unk_84_state: AtomicU32,
138-
pub(crate) unk_88: u32,
138+
pub(crate) error_count: AtomicU32,
139139
pub(crate) unk_8c: u32,
140140
pub(crate) unk_90: u32,
141141
pub(crate) unk_94: u32,

drivers/gpu/drm/asahi/gpu.rs

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ use kernel::{
3434
use crate::alloc::Allocator;
3535
use crate::debug::*;
3636
use crate::driver::{AsahiDevRef, AsahiDevice};
37-
use crate::fw::channels::PipeType;
37+
use crate::fw::channels::{ChannelErrorType, PipeType};
3838
use crate::fw::types::{U32, U64};
3939
use crate::{
4040
alloc, buffer, channel, event, fw, gem, hw, initdata, mem, mmu, queue, regs, workqueue,
@@ -255,6 +255,14 @@ pub(crate) trait GpuManager: Send + Sync {
255255
fn handle_timeout(&self, counter: u32, event_slot: i32, unk: u32);
256256
/// Handle a GPU fault event.
257257
fn handle_fault(&self);
258+
/// Handle a channel error event.
259+
fn handle_channel_error(
260+
&self,
261+
error_type: ChannelErrorType,
262+
pipe_type: u32,
263+
event_slot: u32,
264+
event_value: u32,
265+
);
258266
/// Acknowledge a Buffer grow op.
259267
fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32);
260268
/// Wait for the GPU to become idle and power off.
@@ -1344,6 +1352,82 @@ impl GpuManager for GpuManager::ver {
13441352
self.recover();
13451353
}
13461354

1355+
fn handle_channel_error(
1356+
&self,
1357+
error_type: ChannelErrorType,
1358+
pipe_type: u32,
1359+
event_slot: u32,
1360+
event_value: u32,
1361+
) {
1362+
dev_err!(self.dev, " (\\________/) \n");
1363+
dev_err!(self.dev, " | | \n");
1364+
dev_err!(self.dev, "'.| \\ , / |.'\n");
1365+
dev_err!(self.dev, "--| / (( \\ |--\n");
1366+
dev_err!(self.dev, ".'| _-_- |'.\n");
1367+
dev_err!(self.dev, " |________| \n");
1368+
dev_err!(self.dev, "GPU channel error nya~!!!!!\n");
1369+
dev_err!(self.dev, " Error type: {:?}\n", error_type);
1370+
dev_err!(self.dev, " Pipe type: {}\n", pipe_type);
1371+
dev_err!(self.dev, " Event slot: {}\n", event_slot);
1372+
dev_err!(self.dev, " Event value: {:#x?}\n", event_value);
1373+
1374+
self.event_manager.mark_error(
1375+
event_slot,
1376+
event_value,
1377+
workqueue::WorkError::ChannelError(error_type),
1378+
);
1379+
1380+
let wq = match self.event_manager.get_owner(event_slot) {
1381+
Some(wq) => wq,
1382+
None => {
1383+
dev_err!(self.dev, "Workqueue not found for this event slot!\n");
1384+
return;
1385+
}
1386+
};
1387+
1388+
let wq = match wq.as_any().downcast_ref::<workqueue::WorkQueue::ver>() {
1389+
Some(wq) => wq,
1390+
None => {
1391+
dev_crit!(self.dev, "GpuManager mismatched with WorkQueue!\n");
1392+
return;
1393+
}
1394+
};
1395+
1396+
if debug_enabled(DebugFlags::VerboseFaults) {
1397+
wq.dump_info();
1398+
}
1399+
1400+
let dc = fw::channels::DeviceControlMsg::ver::RecoverChannel {
1401+
pipe_type,
1402+
work_queue: wq.info_pointer(),
1403+
event_value,
1404+
__pad: Default::default(),
1405+
};
1406+
1407+
mod_dev_dbg!(self.dev, "Recover Channel command: {:?}\n", &dc);
1408+
let mut txch = self.tx_channels.lock();
1409+
1410+
let token = txch.device_control.send(&dc);
1411+
{
1412+
let mut guard = self.rtkit.lock();
1413+
let rtk = guard.as_mut().unwrap();
1414+
if rtk
1415+
.send_message(EP_DOORBELL, MSG_TX_DOORBELL | DOORBELL_DEVCTRL)
1416+
.is_err()
1417+
{
1418+
dev_err!(self.dev, "Failed to send Recover Channel command\n");
1419+
}
1420+
}
1421+
1422+
if txch.device_control.wait_for(token).is_err() {
1423+
dev_err!(self.dev, "Timed out waiting for Recover Channel command\n");
1424+
}
1425+
1426+
if debug_enabled(DebugFlags::VerboseFaults) {
1427+
wq.dump_info();
1428+
}
1429+
}
1430+
13471431
fn ack_grow(&self, buffer_slot: u32, vm_slot: u32, counter: u32) {
13481432
let halt_count = self
13491433
.initdata

drivers/gpu/drm/asahi/workqueue.rs

Lines changed: 49 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@
1414
//! up its associated event.
1515
1616
use crate::debug::*;
17-
use crate::fw::channels::PipeType;
17+
use crate::fw::channels::{ChannelErrorType, PipeType};
1818
use crate::fw::types::*;
1919
use crate::fw::workqueue::*;
2020
use crate::no_debug;
2121
use crate::object::OpaqueGpuObject;
2222
use crate::regs::FaultReason;
2323
use crate::{channel, driver, event, fw, gpu, object, regs};
24+
use core::any::Any;
2425
use core::num::NonZeroU64;
2526
use core::sync::atomic::Ordering;
2627
use kernel::{
@@ -47,6 +48,8 @@ pub(crate) enum WorkError {
4748
Fault(regs::FaultInfo),
4849
/// Work failed due to an error caused by other concurrent GPU work.
4950
Killed,
51+
/// Channel error
52+
ChannelError(ChannelErrorType),
5053
/// The GPU crashed.
5154
NoDevice,
5255
/// Unknown reason.
@@ -78,6 +81,9 @@ impl From<WorkError> for uapi::drm_asahi_result_info {
7881
status: match a {
7982
WorkError::Timeout => uapi::drm_asahi_status_DRM_ASAHI_STATUS_TIMEOUT,
8083
WorkError::Killed => uapi::drm_asahi_status_DRM_ASAHI_STATUS_KILLED,
84+
WorkError::ChannelError(_) => {
85+
uapi::drm_asahi_status_DRM_ASAHI_STATUS_CHANNEL_ERROR
86+
}
8187
WorkError::NoDevice => uapi::drm_asahi_status_DRM_ASAHI_STATUS_NO_DEVICE,
8288
_ => uapi::drm_asahi_status_DRM_ASAHI_STATUS_UNKNOWN_ERROR,
8389
},
@@ -96,6 +102,7 @@ impl From<WorkError> for kernel::error::Error {
96102
WorkError::Unknown => ENODATA,
97103
WorkError::Killed => ECANCELED,
98104
WorkError::NoDevice => ENODEV,
105+
WorkError::ChannelError(_) => EIO,
99106
}
100107
}
101108
}
@@ -600,20 +607,26 @@ impl WorkQueue::ver {
600607
size: u32,
601608
) -> Result<Arc<WorkQueue::ver>> {
602609
let gpu_buf = alloc.private.array_empty_tagged(0x2c18, b"GPBF")?;
603-
let shared = &mut alloc.shared;
610+
let mut state = alloc.shared.new_default::<RingState>()?;
611+
let ring = alloc.shared.array_empty(size as usize)?;
604612
let inner = WorkQueueInner::ver {
605613
dev: dev.into(),
606614
event_manager,
607-
info: alloc.private.new_init(
615+
// Use shared (coherent) state with verbose faults so we can dump state correctly
616+
info: if debug_enabled(DebugFlags::VerboseFaults) {
617+
&mut alloc.shared
618+
} else {
619+
&mut alloc.private
620+
}
621+
.new_init(
608622
try_init!(QueueInfo::ver {
609623
state: {
610-
let mut s = shared.new_default::<RingState>()?;
611-
s.with_mut(|raw, _inner| {
624+
state.with_mut(|raw, _inner| {
612625
raw.rb_size = size;
613626
});
614-
s
627+
state
615628
},
616-
ring: shared.array_empty(size as usize)?,
629+
ring,
617630
gpu_buf,
618631
notifier_list: notifier_list,
619632
gpu_context: gpu_context,
@@ -638,7 +651,7 @@ impl WorkQueue::ver {
638651
#[ver(V >= V13_2 && G < G14X)]
639652
unk_84_0: 0,
640653
unk_84_state: Default::default(),
641-
unk_88: 0,
654+
error_count: Default::default(),
642655
unk_8c: 0,
643656
unk_90: 0,
644657
unk_94: 0,
@@ -743,18 +756,46 @@ impl WorkQueue::ver {
743756
pub(crate) fn pipe_type(&self) -> PipeType {
744757
self.inner.lock().pipe_type
745758
}
759+
760+
/// Dumps this workqueue's `QueueInfo` state to the kernel log.
///
/// Prints the info pointer followed by the three GPU read pointers, the
/// event ID, the busy flag, the unknown 0x84 state word, the error count
/// and the pending counter. All fields are read with relaxed atomic loads,
/// so the values are a best-effort snapshot for debugging.
pub(crate) fn dump_info(&self) {
    pr_info!("WorkQueue @ {:?}:\n", self.info_pointer);
    self.inner.lock().info.with(|raw, _inner| {
        pr_info!(" GPU rptr1: {:#x}\n", raw.gpu_rptr1.load(Ordering::Relaxed));
        pr_info!(" GPU rptr2: {:#x}\n", raw.gpu_rptr2.load(Ordering::Relaxed));
        pr_info!(" GPU rptr3: {:#x}\n", raw.gpu_rptr3.load(Ordering::Relaxed));
        pr_info!(" Event ID: {:#x}\n", raw.event_id.load(Ordering::Relaxed));
        pr_info!(" Busy: {:#x}\n", raw.busy.load(Ordering::Relaxed));
        pr_info!(" Unk 84: {:#x}\n", raw.unk_84_state.load(Ordering::Relaxed));
        pr_info!(
            " Error count: {:#x}\n",
            raw.error_count.load(Ordering::Relaxed)
        );
        pr_info!(" Pending: {:#x}\n", raw.pending.load(Ordering::Relaxed));
    });
}
776+
777+
/// Returns the GPU weak pointer to this queue's `QueueInfo` structure.
///
/// This is a copy of the stored `info_pointer` field; the channel error
/// handler passes it as the `work_queue` field of a `RecoverChannel` command.
pub(crate) fn info_pointer(&self) -> GpuWeakPointer<QueueInfo::ver> {
778+
self.info_pointer
779+
}
746780
}
747781

748782
/// Trait used to erase the version-specific type of WorkQueues, to avoid leaking
749783
/// version-specificity into the event module.
750784
pub(crate) trait WorkQueue {
785+
/// Cast as an Any type.
///
/// Lets code that only holds a `dyn WorkQueue` recover the concrete,
/// version-specific queue type via `downcast_ref`.
786+
fn as_any(&self) -> &dyn Any;
787+
751788
/// Signal the workqueue that some work was completed.
// NOTE(review): the meaning of the returned `bool` is not visible in this
// hunk; confirm against the implementation.
fn signal(&self) -> bool;
752789
/// Report `error` for the work associated with event value `value`.
// NOTE(review): presumably marks only the matching command as failed;
// verify against the implementation.
fn mark_error(&self, value: event::EventValue, error: WorkError);
753790
/// Fail the queue's work with `error`.
// NOTE(review): presumably fails every outstanding command, as the event
// manager's `fail_all` does on a GPU crash; verify against the
// implementation.
fn fail_all(&self, error: WorkError);
754791
}
755792

756793
#[versions(AGX)]
757794
impl WorkQueue for WorkQueue::ver {
795+
/// Returns `self` as `&dyn Any` so that holders of the type-erased
/// `dyn WorkQueue` can downcast back to this versioned queue type.
fn as_any(&self) -> &dyn Any {
796+
self
797+
}
798+
758799
/// Signal a workqueue that some work was completed.
759800
///
760801
/// This will check the event stamp value to find out exactly how many commands were processed.

0 commit comments

Comments
 (0)