@@ -452,10 +452,18 @@ xfs_buf_item_format(
452452 * This is called to pin the buffer associated with the buf log item in memory
453453 * so it cannot be written out.
454454 *
455- * We also always take a reference to the buffer log item here so that the bli
456- * is held while the item is pinned in memory. This means that we can
457- * unconditionally drop the reference count a transaction holds when the
458- * transaction is completed.
455+ * We take a reference to the buffer log item here so that the BLI life cycle
456+ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
457+ * inserted into the AIL.
458+ *
459+ * We also need to take a reference to the buffer itself as the BLI unpin
460+ * processing requires accessing the buffer after the BLI has dropped the final
461+ * BLI reference. See xfs_buf_item_unpin() for an explanation.
462+ * If unpins race to drop the final BLI reference and only the
463+ * BLI owns a reference to the buffer, then the loser of the race can have the
464+ * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per
465+ * pin count ensures the life cycle of the buffer extends for as
466+ * long as we hold the buffer pin reference in xfs_buf_item_unpin().
459467 */
460468STATIC void
461469xfs_buf_item_pin (
@@ -470,13 +478,30 @@ xfs_buf_item_pin(
470478
471479 trace_xfs_buf_item_pin (bip );
472480
481+ xfs_buf_hold (bip -> bli_buf );
473482 atomic_inc (& bip -> bli_refcount );
474483 atomic_inc (& bip -> bli_buf -> b_pin_count );
475484}
476485
477486/*
478- * This is called to unpin the buffer associated with the buf log item which
479- * was previously pinned with a call to xfs_buf_item_pin().
487+ * This is called to unpin the buffer associated with the buf log item which was
488+ * previously pinned with a call to xfs_buf_item_pin(). We enter this function
489+ * with a buffer pin count, a buffer reference and a BLI reference.
490+ *
491+ * We must drop the BLI reference before we unpin the buffer because the AIL
492+ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
493+ * refcount drops to zero, the bli could still be AIL resident and the buffer
494+ * submitted for I/O at any point before we return. This can result in IO
495+ * completion freeing the buffer while we are still trying to access it here.
496+ * This race condition can also occur in shutdown situations where we abort and
497+ * unpin buffers from contexts other that journal IO completion.
498+ *
499+ * Hence we have to hold a buffer reference per pin count to ensure that the
500+ * buffer cannot be freed until we have finished processing the unpin operation.
501+ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
502+ * are done processing the buffer state. In the case of an abort (remove =
503+ * true) then we re-use the current pin reference as the IO reference we hand
504+ * off to IO failure handling.
480505 */
481506STATIC void
482507xfs_buf_item_unpin (
@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
493518
494519 trace_xfs_buf_item_unpin (bip );
495520
496- /*
497- * Drop the bli ref associated with the pin and grab the hold required
498- * for the I/O simulation failure in the abort case. We have to do this
499- * before the pin count drops because the AIL doesn't acquire a bli
500- * reference. Therefore if the refcount drops to zero, the bli could
501- * still be AIL resident and the buffer submitted for I/O (and freed on
502- * completion) at any point before we return. This can be removed once
503- * the AIL properly holds a reference on the bli.
504- */
505521 freed = atomic_dec_and_test (& bip -> bli_refcount );
506- if (freed && !stale && remove )
507- xfs_buf_hold (bp );
508522 if (atomic_dec_and_test (& bp -> b_pin_count ))
509523 wake_up_all (& bp -> b_waiters );
510524
511- /* nothing to do but drop the pin count if the bli is active */
512- if (!freed )
525+ /*
526+ * Nothing to do but drop the buffer pin reference if the BLI is
527+ * still active.
528+ */
529+ if (!freed ) {
530+ xfs_buf_rele (bp );
513531 return ;
532+ }
514533
515534 if (stale ) {
516535 ASSERT (bip -> bli_flags & XFS_BLI_STALE );
@@ -522,6 +541,15 @@ xfs_buf_item_unpin(
522541
523542 trace_xfs_buf_item_unpin_stale (bip );
524543
544+ /*
545+ * The buffer has been locked and referenced since it was marked
546+ * stale so we own both lock and reference exclusively here. We
547+ * do not need the pin reference any more, so drop it now so
548+ * that we only have one reference to drop once item completion
549+ * processing is complete.
550+ */
551+ xfs_buf_rele (bp );
552+
525553 /*
526554 * If we get called here because of an IO error, we may or may
527555 * not have the item on the AIL. xfs_trans_ail_delete() will
@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
538566 ASSERT (bp -> b_log_item == NULL );
539567 }
540568 xfs_buf_relse (bp );
541- } else if (remove ) {
569+ return ;
570+ }
571+
572+ if (remove ) {
542573 /*
543- * The buffer must be locked and held by the caller to simulate
544- * an async I/O failure. We acquired the hold for this case
545- * before the buffer was unpinned.
574+ * We need to simulate an async IO failures here to ensure that
575+ * the correct error completion is run on this buffer. This
576+ * requires a reference to the buffer and for the buffer to be
577+ * locked. We can safely pass ownership of the pin reference to
578+ * the IO to ensure that nothing can free the buffer while we
579+ * wait for the lock and then run the IO failure completion.
546580 */
547581 xfs_buf_lock (bp );
548582 bp -> b_flags |= XBF_ASYNC ;
549583 xfs_buf_ioend_fail (bp );
584+ return ;
550585 }
586+
587+ /*
588+ * BLI has no more active references - it will be moved to the AIL to
589+ * manage the remaining BLI/buffer life cycle. There is nothing left for
590+ * us to do here so drop the pin reference to the buffer.
591+ */
592+ xfs_buf_rele (bp );
551593}
552594
553595STATIC uint
0 commit comments