Skip to content

Commit 322617a

Browse files
Koichiro Den authored and Jon Mason committed
NTB: ntb_transport: Add 'tx_memcpy_offload' module option
Some platforms (e.g. R-Car S4) do not gain from using a DMAC on the TX path in ntb_transport and end up CPU-bound on memcpy_toio(). Add a module parameter 'tx_memcpy_offload' that moves the TX memcpy_toio() and descriptor writes to a per-QP kernel thread. It is disabled by default. This change also fixes a rare ordering hazard in ntb_tx_copy_callback(), which was observed on R-Car S4 once throughput improved with the new module parameter: the DONE flag write to the peer MW, which is WC-mapped, could be observed after the DB/MSI trigger. Both operations are posted PCIe MWr (often via different OB iATUs), so WC buffering and bridges may reorder visibility. Insert dma_mb() to enforce store->load ordering and then read back hdr->flags to flush the posted write before ringing the doorbell / issuing the MSI. While at it, update tx_index with WRITE_ONCE() at the earliest possible location to make ntb_transport_tx_free_entry() robust. Signed-off-by: Koichiro Den <den@valinux.co.jp> Signed-off-by: Jon Mason <jdmason@kudzu.us>
1 parent b36490b commit 322617a

1 file changed

Lines changed: 100 additions & 4 deletions

File tree

drivers/ntb/ntb_transport.c

Lines changed: 100 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,12 +54,14 @@
5454
#include <linux/errno.h>
5555
#include <linux/export.h>
5656
#include <linux/interrupt.h>
57+
#include <linux/kthread.h>
5758
#include <linux/module.h>
5859
#include <linux/pci.h>
5960
#include <linux/slab.h>
6061
#include <linux/types.h>
6162
#include <linux/uaccess.h>
6263
#include <linux/mutex.h>
64+
#include <linux/wait.h>
6365
#include "linux/ntb.h"
6466
#include "linux/ntb_transport.h"
6567

@@ -100,6 +102,10 @@ module_param(use_msi, bool, 0644);
100102
MODULE_PARM_DESC(use_msi, "Use MSI interrupts instead of doorbells");
101103
#endif
102104

105+
static bool tx_memcpy_offload;
106+
module_param(tx_memcpy_offload, bool, 0644);
107+
MODULE_PARM_DESC(tx_memcpy_offload, "Offload TX memcpy_toio() to a kernel thread");
108+
103109
static struct dentry *nt_debugfs_dir;
104110

105111
/* Only two-ports NTB devices are supported */
@@ -148,7 +154,9 @@ struct ntb_transport_qp {
148154
void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
149155
void *data, int len);
150156
struct list_head tx_free_q;
157+
struct list_head tx_offl_q;
151158
spinlock_t ntb_tx_free_q_lock;
159+
spinlock_t ntb_tx_offl_q_lock;
152160
void __iomem *tx_mw;
153161
phys_addr_t tx_mw_phys;
154162
size_t tx_mw_size;
@@ -199,6 +207,9 @@ struct ntb_transport_qp {
199207
int msi_irq;
200208
struct ntb_msi_desc msi_desc;
201209
struct ntb_msi_desc peer_msi_desc;
210+
211+
struct task_struct *tx_offload_thread;
212+
wait_queue_head_t tx_offload_wq;
202213
};
203214

204215
struct ntb_transport_mw {
@@ -284,7 +295,13 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
284295
static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset);
285296
static int ntb_async_rx_submit(struct ntb_queue_entry *entry, void *offset);
286297
static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset);
298+
static int ntb_tx_memcpy_kthread(void *data);
299+
287300

301+
static inline bool ntb_tx_offload_enabled(struct ntb_transport_qp *qp)
302+
{
303+
return tx_memcpy_offload && qp && qp->tx_offload_thread;
304+
}
288305

289306
static int ntb_transport_bus_match(struct device *dev,
290307
const struct device_driver *drv)
@@ -1254,11 +1271,13 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
12541271

12551272
spin_lock_init(&qp->ntb_rx_q_lock);
12561273
spin_lock_init(&qp->ntb_tx_free_q_lock);
1274+
spin_lock_init(&qp->ntb_tx_offl_q_lock);
12571275

12581276
INIT_LIST_HEAD(&qp->rx_post_q);
12591277
INIT_LIST_HEAD(&qp->rx_pend_q);
12601278
INIT_LIST_HEAD(&qp->rx_free_q);
12611279
INIT_LIST_HEAD(&qp->tx_free_q);
1280+
INIT_LIST_HEAD(&qp->tx_offl_q);
12621281

12631282
tasklet_init(&qp->rxc_db_work, ntb_transport_rxc_db,
12641283
(unsigned long)qp);
@@ -1785,6 +1804,13 @@ static void ntb_tx_copy_callback(void *data,
17851804

17861805
iowrite32(entry->flags | DESC_DONE_FLAG, &hdr->flags);
17871806

1807+
/*
1808+
* Make DONE flag visible before DB/MSI. WC + posted MWr may reorder
1809+
* across iATU/bridge (platform-dependent). Order and flush here.
1810+
*/
1811+
dma_mb();
1812+
ioread32(&hdr->flags);
1813+
17881814
if (qp->use_msi)
17891815
ntb_msi_peer_trigger(qp->ndev, PIDX, &qp->peer_msi_desc);
17901816
else
@@ -1805,7 +1831,7 @@ static void ntb_tx_copy_callback(void *data,
18051831
ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry, &qp->tx_free_q);
18061832
}
18071833

1808-
static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
1834+
static void ntb_memcpy_tx_on_stack(struct ntb_queue_entry *entry, void __iomem *offset)
18091835
{
18101836
#ifdef ARCH_HAS_NOCACHE_UACCESS
18111837
/*
@@ -1823,6 +1849,54 @@ static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
18231849
ntb_tx_copy_callback(entry, NULL);
18241850
}
18251851

1852+
static int ntb_tx_memcpy_kthread(void *data)
1853+
{
1854+
struct ntb_transport_qp *qp = data;
1855+
struct ntb_queue_entry *entry, *tmp;
1856+
const int resched_nr = 64;
1857+
LIST_HEAD(local_list);
1858+
void __iomem *offset;
1859+
int processed = 0;
1860+
1861+
while (!kthread_should_stop()) {
1862+
spin_lock_irq(&qp->ntb_tx_offl_q_lock);
1863+
wait_event_interruptible_lock_irq_timeout(qp->tx_offload_wq,
1864+
kthread_should_stop() ||
1865+
!list_empty(&qp->tx_offl_q),
1866+
qp->ntb_tx_offl_q_lock, 5*HZ);
1867+
list_splice_tail_init(&qp->tx_offl_q, &local_list);
1868+
spin_unlock_irq(&qp->ntb_tx_offl_q_lock);
1869+
1870+
list_for_each_entry_safe(entry, tmp, &local_list, entry) {
1871+
list_del(&entry->entry);
1872+
offset = qp->tx_mw + qp->tx_max_frame * entry->tx_index;
1873+
ntb_memcpy_tx_on_stack(entry, offset);
1874+
if (++processed >= resched_nr) {
1875+
cond_resched();
1876+
processed = 0;
1877+
}
1878+
}
1879+
cond_resched();
1880+
}
1881+
1882+
return 0;
1883+
}
1884+
1885+
/*
 * ntb_memcpy_tx() - dispatch a TX copy either inline or to the offload
 * kernel thread, depending on ntb_tx_offload_enabled().
 *
 * @entry:  queue entry describing the buffer to transmit
 * @offset: destination in the peer memory window (used only for the
 *          inline path; the offload thread recomputes it from
 *          entry->tx_index)
 *
 * Fix vs. original: Linux coding style requires braces on both branches
 * of an if/else when either branch is braced; the else arm was unbraced.
 */
static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
{
	struct ntb_transport_qp *qp = entry->qp;

	if (WARN_ON_ONCE(!qp))
		return;

	if (ntb_tx_offload_enabled(qp)) {
		ntb_list_add(&qp->ntb_tx_offl_q_lock, &entry->entry,
			     &qp->tx_offl_q);
		wake_up(&qp->tx_offload_wq);
	} else {
		ntb_memcpy_tx_on_stack(entry, offset);
	}
}
1899+
18261900
static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
18271901
struct ntb_queue_entry *entry)
18281902
{
@@ -1895,6 +1969,9 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
18951969
hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header);
18961970
entry->tx_hdr = hdr;
18971971

1972+
WARN_ON_ONCE(!ntb_transport_tx_free_entry(qp));
1973+
WRITE_ONCE(qp->tx_index, (qp->tx_index + 1) % qp->tx_max_entry);
1974+
18981975
iowrite32(entry->len, &hdr->len);
18991976
iowrite32((u32)qp->tx_pkts, &hdr->ver);
19001977

@@ -1935,9 +2012,6 @@ static int ntb_process_tx(struct ntb_transport_qp *qp,
19352012

19362013
ntb_async_tx(qp, entry);
19372014

1938-
qp->tx_index++;
1939-
qp->tx_index %= qp->tx_max_entry;
1940-
19412015
qp->tx_pkts++;
19422016

19432017
return 0;
@@ -2034,6 +2108,20 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
20342108
qp->tx_handler = handlers->tx_handler;
20352109
qp->event_handler = handlers->event_handler;
20362110

2111+
init_waitqueue_head(&qp->tx_offload_wq);
2112+
if (tx_memcpy_offload) {
2113+
qp->tx_offload_thread = kthread_run(ntb_tx_memcpy_kthread, qp,
2114+
"ntb-txcpy/%s/%u",
2115+
pci_name(ndev->pdev), qp->qp_num);
2116+
if (IS_ERR(qp->tx_offload_thread)) {
2117+
dev_warn(&nt->ndev->dev,
2118+
"tx memcpy offload thread creation failed: %ld; falling back to inline copy\n",
2119+
PTR_ERR(qp->tx_offload_thread));
2120+
qp->tx_offload_thread = NULL;
2121+
}
2122+
} else
2123+
qp->tx_offload_thread = NULL;
2124+
20372125
dma_cap_zero(dma_mask);
20382126
dma_cap_set(DMA_MEMCPY, dma_mask);
20392127

@@ -2141,6 +2229,11 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
21412229

21422230
qp->active = false;
21432231

2232+
if (qp->tx_offload_thread) {
2233+
kthread_stop(qp->tx_offload_thread);
2234+
qp->tx_offload_thread = NULL;
2235+
}
2236+
21442237
if (qp->tx_dma_chan) {
21452238
struct dma_chan *chan = qp->tx_dma_chan;
21462239
/* Putting the dma_chan to NULL will force any new traffic to be
@@ -2204,6 +2297,9 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
22042297
while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
22052298
kfree(entry);
22062299

2300+
while ((entry = ntb_list_rm(&qp->ntb_tx_offl_q_lock, &qp->tx_offl_q)))
2301+
kfree(entry);
2302+
22072303
qp->transport->qp_bitmap_free |= qp_bit;
22082304

22092305
dev_info(&pdev->dev, "NTB Transport QP %d freed\n", qp->qp_num);

0 commit comments

Comments
 (0)