5454#include <linux/errno.h>
5555#include <linux/export.h>
5656#include <linux/interrupt.h>
57+ #include <linux/kthread.h>
5758#include <linux/module.h>
5859#include <linux/pci.h>
5960#include <linux/slab.h>
6061#include <linux/types.h>
6162#include <linux/uaccess.h>
6263#include <linux/mutex.h>
64+ #include <linux/wait.h>
6365#include "linux/ntb.h"
6466#include "linux/ntb_transport.h"
6567
@@ -100,6 +102,10 @@ module_param(use_msi, bool, 0644);
100102MODULE_PARM_DESC (use_msi , "Use MSI interrupts instead of doorbells" );
101103#endif
102104
105+ static bool tx_memcpy_offload ;
106+ module_param (tx_memcpy_offload , bool , 0644 );
107+ MODULE_PARM_DESC (tx_memcpy_offload , "Offload TX memcpy_toio() to a kernel thread" );
108+
103109static struct dentry * nt_debugfs_dir ;
104110
105111/* Only two-ports NTB devices are supported */
@@ -148,7 +154,9 @@ struct ntb_transport_qp {
148154 void (* tx_handler )(struct ntb_transport_qp * qp , void * qp_data ,
149155 void * data , int len );
150156 struct list_head tx_free_q ;
157+ struct list_head tx_offl_q ;
151158 spinlock_t ntb_tx_free_q_lock ;
159+ spinlock_t ntb_tx_offl_q_lock ;
152160 void __iomem * tx_mw ;
153161 phys_addr_t tx_mw_phys ;
154162 size_t tx_mw_size ;
@@ -199,6 +207,9 @@ struct ntb_transport_qp {
199207 int msi_irq ;
200208 struct ntb_msi_desc msi_desc ;
201209 struct ntb_msi_desc peer_msi_desc ;
210+
211+ struct task_struct * tx_offload_thread ;
212+ wait_queue_head_t tx_offload_wq ;
202213};
203214
204215struct ntb_transport_mw {
@@ -284,7 +295,13 @@ static int ntb_async_tx_submit(struct ntb_transport_qp *qp,
284295static void ntb_memcpy_tx (struct ntb_queue_entry * entry , void __iomem * offset );
285296static int ntb_async_rx_submit (struct ntb_queue_entry * entry , void * offset );
286297static void ntb_memcpy_rx (struct ntb_queue_entry * entry , void * offset );
298+ static int ntb_tx_memcpy_kthread (void * data );
299+
287300
301+ static inline bool ntb_tx_offload_enabled (struct ntb_transport_qp * qp )
302+ {
303+ return tx_memcpy_offload && qp && qp -> tx_offload_thread ;
304+ }
288305
289306static int ntb_transport_bus_match (struct device * dev ,
290307 const struct device_driver * drv )
@@ -1254,11 +1271,13 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
12541271
12551272 spin_lock_init (& qp -> ntb_rx_q_lock );
12561273 spin_lock_init (& qp -> ntb_tx_free_q_lock );
1274+ spin_lock_init (& qp -> ntb_tx_offl_q_lock );
12571275
12581276 INIT_LIST_HEAD (& qp -> rx_post_q );
12591277 INIT_LIST_HEAD (& qp -> rx_pend_q );
12601278 INIT_LIST_HEAD (& qp -> rx_free_q );
12611279 INIT_LIST_HEAD (& qp -> tx_free_q );
1280+ INIT_LIST_HEAD (& qp -> tx_offl_q );
12621281
12631282 tasklet_init (& qp -> rxc_db_work , ntb_transport_rxc_db ,
12641283 (unsigned long )qp );
@@ -1785,6 +1804,13 @@ static void ntb_tx_copy_callback(void *data,
17851804
17861805 iowrite32 (entry -> flags | DESC_DONE_FLAG , & hdr -> flags );
17871806
1807+ /*
1808+ * Make DONE flag visible before DB/MSI. WC + posted MWr may reorder
1809+ * across iATU/bridge (platform-dependent). Order and flush here.
1810+ */
1811+ dma_mb ();
1812+ ioread32 (& hdr -> flags );
1813+
17881814 if (qp -> use_msi )
17891815 ntb_msi_peer_trigger (qp -> ndev , PIDX , & qp -> peer_msi_desc );
17901816 else
@@ -1805,7 +1831,7 @@ static void ntb_tx_copy_callback(void *data,
18051831 ntb_list_add (& qp -> ntb_tx_free_q_lock , & entry -> entry , & qp -> tx_free_q );
18061832}
18071833
1808- static void ntb_memcpy_tx (struct ntb_queue_entry * entry , void __iomem * offset )
1834+ static void ntb_memcpy_tx_on_stack (struct ntb_queue_entry * entry , void __iomem * offset )
18091835{
18101836#ifdef ARCH_HAS_NOCACHE_UACCESS
18111837 /*
@@ -1823,6 +1849,54 @@ static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
18231849 ntb_tx_copy_callback (entry , NULL );
18241850}
18251851
/*
 * ntb_tx_memcpy_kthread() - per-QP worker that drains queued TX copy requests.
 * @data: the struct ntb_transport_qp this thread serves.
 *
 * Entries are queued onto qp->tx_offl_q by ntb_memcpy_tx() and spliced here
 * onto a private list so the spinlock is held only briefly.  Each entry is
 * then copied into the peer memory window by ntb_memcpy_tx_on_stack(), which
 * also runs the completion path (DONE flag plus doorbell/MSI).
 *
 * Returns 0 when stopped via kthread_stop().
 */
static int ntb_tx_memcpy_kthread(void *data)
{
	struct ntb_transport_qp *qp = data;
	struct ntb_queue_entry *entry, *tmp;
	const int resched_nr = 64;	/* yield after this many copies to bound latency */
	LIST_HEAD(local_list);
	void __iomem *offset;
	int processed = 0;

	while (!kthread_should_stop()) {
		/*
		 * Sleep until work arrives or stop is requested.  The wait
		 * helper requires the lock held on entry and drops/retakes it
		 * internally; the 5 s timeout is a safety poll in case a
		 * wakeup is ever missed.
		 */
		spin_lock_irq(&qp->ntb_tx_offl_q_lock);
		wait_event_interruptible_lock_irq_timeout(qp->tx_offload_wq,
							  kthread_should_stop() ||
							  !list_empty(&qp->tx_offl_q),
							  qp->ntb_tx_offl_q_lock, 5 * HZ);
		/* Take everything queued so far; copies run unlocked below. */
		list_splice_tail_init(&qp->tx_offl_q, &local_list);
		spin_unlock_irq(&qp->ntb_tx_offl_q_lock);

		list_for_each_entry_safe(entry, tmp, &local_list, entry) {
			list_del(&entry->entry);
			/*
			 * Recompute the MW slot from entry->tx_index, which
			 * was fixed at submit time in ntb_async_tx().
			 */
			offset = qp->tx_mw + qp->tx_max_frame * entry->tx_index;
			ntb_memcpy_tx_on_stack(entry, offset);
			if (++processed >= resched_nr) {
				cond_resched();
				processed = 0;
			}
		}
		cond_resched();
	}

	/*
	 * NOTE(review): entries still sitting on tx_offl_q at stop time are
	 * reclaimed by ntb_transport_free_queue(); entries already spliced to
	 * local_list are copied above before the loop condition is rechecked.
	 */
	return 0;
}
1884+
1885+ static void ntb_memcpy_tx (struct ntb_queue_entry * entry , void __iomem * offset )
1886+ {
1887+ struct ntb_transport_qp * qp = entry -> qp ;
1888+
1889+ if (WARN_ON_ONCE (!qp ))
1890+ return ;
1891+
1892+ if (ntb_tx_offload_enabled (qp )) {
1893+ ntb_list_add (& qp -> ntb_tx_offl_q_lock , & entry -> entry ,
1894+ & qp -> tx_offl_q );
1895+ wake_up (& qp -> tx_offload_wq );
1896+ } else
1897+ ntb_memcpy_tx_on_stack (entry , offset );
1898+ }
1899+
18261900static int ntb_async_tx_submit (struct ntb_transport_qp * qp ,
18271901 struct ntb_queue_entry * entry )
18281902{
@@ -1895,6 +1969,9 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
18951969 hdr = offset + qp -> tx_max_frame - sizeof (struct ntb_payload_header );
18961970 entry -> tx_hdr = hdr ;
18971971
1972+ WARN_ON_ONCE (!ntb_transport_tx_free_entry (qp ));
1973+ WRITE_ONCE (qp -> tx_index , (qp -> tx_index + 1 ) % qp -> tx_max_entry );
1974+
18981975 iowrite32 (entry -> len , & hdr -> len );
18991976 iowrite32 ((u32 )qp -> tx_pkts , & hdr -> ver );
19001977
@@ -1935,9 +2012,6 @@ static int ntb_process_tx(struct ntb_transport_qp *qp,
19352012
19362013 ntb_async_tx (qp , entry );
19372014
1938- qp -> tx_index ++ ;
1939- qp -> tx_index %= qp -> tx_max_entry ;
1940-
19412015 qp -> tx_pkts ++ ;
19422016
19432017 return 0 ;
@@ -2034,6 +2108,20 @@ ntb_transport_create_queue(void *data, struct device *client_dev,
20342108 qp -> tx_handler = handlers -> tx_handler ;
20352109 qp -> event_handler = handlers -> event_handler ;
20362110
2111+ init_waitqueue_head (& qp -> tx_offload_wq );
2112+ if (tx_memcpy_offload ) {
2113+ qp -> tx_offload_thread = kthread_run (ntb_tx_memcpy_kthread , qp ,
2114+ "ntb-txcpy/%s/%u" ,
2115+ pci_name (ndev -> pdev ), qp -> qp_num );
2116+ if (IS_ERR (qp -> tx_offload_thread )) {
2117+ dev_warn (& nt -> ndev -> dev ,
2118+ "tx memcpy offload thread creation failed: %ld; falling back to inline copy\n" ,
2119+ PTR_ERR (qp -> tx_offload_thread ));
2120+ qp -> tx_offload_thread = NULL ;
2121+ }
2122+ } else
2123+ qp -> tx_offload_thread = NULL ;
2124+
20372125 dma_cap_zero (dma_mask );
20382126 dma_cap_set (DMA_MEMCPY , dma_mask );
20392127
@@ -2141,6 +2229,11 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
21412229
21422230 qp -> active = false;
21432231
2232+ if (qp -> tx_offload_thread ) {
2233+ kthread_stop (qp -> tx_offload_thread );
2234+ qp -> tx_offload_thread = NULL ;
2235+ }
2236+
21442237 if (qp -> tx_dma_chan ) {
21452238 struct dma_chan * chan = qp -> tx_dma_chan ;
21462239 /* Putting the dma_chan to NULL will force any new traffic to be
@@ -2204,6 +2297,9 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
22042297 while ((entry = ntb_list_rm (& qp -> ntb_tx_free_q_lock , & qp -> tx_free_q )))
22052298 kfree (entry );
22062299
2300+ while ((entry = ntb_list_rm (& qp -> ntb_tx_offl_q_lock , & qp -> tx_offl_q )))
2301+ kfree (entry );
2302+
22072303 qp -> transport -> qp_bitmap_free |= qp_bit ;
22082304
22092305 dev_info (& pdev -> dev , "NTB Transport QP %d freed\n" , qp -> qp_num );