Skip to content

Commit 6197cff

Browse files
drm/i915: Dump error capture to kernel log
This is useful for getting debug information out in certain situations, such as failing kernel selftests and CI runs that don't log error captures. It is especially useful for things like retrieving GuC logs as GuC operation can't be tracked by adding printk or ftrace entries. v2: Add CONFIG_DRM_I915_DEBUG_GEM wrapper (review feedback by Rodrigo). Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20230418181744.3251240-2-John.C.Harrison@Intel.com
1 parent 621b678 commit 6197cff

2 files changed

Lines changed: 142 additions & 0 deletions

File tree

drivers/gpu/drm/i915/i915_gpu_error.c

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2228,3 +2228,135 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err)
22282228
i915->gpu_error.first_error = ERR_PTR(err);
22292229
spin_unlock_irq(&i915->gpu_error.lock);
22302230
}
2231+
2232+
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
2233+
void intel_klog_error_capture(struct intel_gt *gt,
2234+
intel_engine_mask_t engine_mask)
2235+
{
2236+
static int g_count;
2237+
struct drm_i915_private *i915 = gt->i915;
2238+
struct i915_gpu_coredump *error;
2239+
intel_wakeref_t wakeref;
2240+
size_t buf_size = PAGE_SIZE * 128;
2241+
size_t pos_err;
2242+
char *buf, *ptr, *next;
2243+
int l_count = g_count++;
2244+
int line = 0;
2245+
2246+
/* Can't allocate memory during a reset */
2247+
if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
2248+
drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n",
2249+
l_count, line++);
2250+
return;
2251+
}
2252+
2253+
error = READ_ONCE(i915->gpu_error.first_error);
2254+
if (error) {
2255+
drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n",
2256+
l_count, line++);
2257+
i915_reset_error_state(i915);
2258+
}
2259+
2260+
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
2261+
error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE);
2262+
2263+
if (IS_ERR(error)) {
2264+
drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error capture: %ld!\n",
2265+
l_count, line++, PTR_ERR(error));
2266+
return;
2267+
}
2268+
2269+
buf = kvmalloc(buf_size, GFP_KERNEL);
2270+
if (!buf) {
2271+
drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n",
2272+
l_count, line++);
2273+
i915_gpu_coredump_put(error);
2274+
return;
2275+
}
2276+
2277+
drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n",
2278+
l_count, line++, __builtin_return_address(0));
2279+
2280+
/* Largest string length safe to print via dmesg */
2281+
# define MAX_CHUNK 800
2282+
2283+
pos_err = 0;
2284+
while (1) {
2285+
ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1);
2286+
2287+
if (got <= 0)
2288+
break;
2289+
2290+
buf[got] = 0;
2291+
pos_err += got;
2292+
2293+
ptr = buf;
2294+
while (got > 0) {
2295+
size_t count;
2296+
char tag[2];
2297+
2298+
next = strnchr(ptr, got, '\n');
2299+
if (next) {
2300+
count = next - ptr;
2301+
*next = 0;
2302+
tag[0] = '>';
2303+
tag[1] = '<';
2304+
} else {
2305+
count = got;
2306+
tag[0] = '}';
2307+
tag[1] = '{';
2308+
}
2309+
2310+
if (count > MAX_CHUNK) {
2311+
size_t pos;
2312+
char *ptr2 = ptr;
2313+
2314+
for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) {
2315+
char chr = ptr[pos];
2316+
2317+
ptr[pos] = 0;
2318+
drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n",
2319+
l_count, line++, ptr2);
2320+
ptr[pos] = chr;
2321+
ptr2 = ptr + pos;
2322+
2323+
/*
2324+
* If spewing large amounts of data via a serial console,
2325+
* this can be a very slow process. So be friendly and try
2326+
* not to cause 'softlockup on CPU' problems.
2327+
*/
2328+
cond_resched();
2329+
}
2330+
2331+
if (ptr2 < (ptr + count))
2332+
drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
2333+
l_count, line++, tag[0], ptr2, tag[1]);
2334+
else if (tag[0] == '>')
2335+
drm_info(&i915->drm, "[Capture/%d.%d] ><\n",
2336+
l_count, line++);
2337+
} else {
2338+
drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n",
2339+
l_count, line++, tag[0], ptr, tag[1]);
2340+
}
2341+
2342+
ptr = next;
2343+
got -= count;
2344+
if (next) {
2345+
ptr++;
2346+
got--;
2347+
}
2348+
2349+
/* As above. */
2350+
cond_resched();
2351+
}
2352+
2353+
if (got)
2354+
drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n",
2355+
l_count, line++, got);
2356+
}
2357+
2358+
kvfree(buf);
2359+
2360+
drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err);
2361+
}
2362+
#endif

drivers/gpu/drm/i915/i915_gpu_error.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,16 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
258258
#define CORE_DUMP_FLAG_NONE 0x0
259259
#define CORE_DUMP_FLAG_IS_GUC_CAPTURE BIT(0)
260260

261+
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) && IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
262+
void intel_klog_error_capture(struct intel_gt *gt,
263+
intel_engine_mask_t engine_mask);
264+
#else
265+
static inline void intel_klog_error_capture(struct intel_gt *gt,
266+
intel_engine_mask_t engine_mask)
267+
{
268+
}
269+
#endif
270+
261271
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
262272

263273
__printf(2, 3)

0 commit comments

Comments
 (0)