@@ -93,6 +93,7 @@
 #include "rw.h"
 #include "alloc_cache.h"
 #include "eventfd.h"
+#include "wait.h"
 
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -166,16 +167,6 @@ static void io_poison_req(struct io_kiocb *req)
 	req->link = IO_URING_PTR_POISON;
 }
 
-static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
-{
-	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
-}
-
-static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
-{
-	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
-}
-
 static inline void req_fail_link_node(struct io_kiocb *req, int res)
 {
 	req_set_fail(req);
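The two helpers removed above are `static inline`, so they presumably move into the new wait.h included at the top of the file. Both count ready CQEs as `tail - head`: the first against the kernel's private `cached_cq_tail`, the `_user` variant against the shared, user-visible tail read with `READ_ONCE()`. Since head and tail are free-running unsigned counters, masked only when the ring array is actually indexed, the subtraction stays correct even across wraparound. A minimal userspace-style sketch of that idiom; `mock_cq_ring` and `mock_cq_events` are hypothetical names, not kernel or liburing API:

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the shared CQ ring: head and tail are
 * free-running u32 counters, masked only when indexing the array. */
struct mock_cq_ring {
	uint32_t head;	/* consumer index, advanced by the application */
	uint32_t tail;	/* producer index, advanced by the kernel */
};

/* Ready events = tail - head; unsigned wraparound keeps this correct. */
static inline uint32_t mock_cq_events(const struct mock_cq_ring *cq)
{
	return cq->tail - cq->head;
}

int main(void)
{
	/* tail has wrapped past UINT32_MAX, head has not caught up yet:
	 * the count is still exactly 3. */
	struct mock_cq_ring cq = { .head = UINT32_MAX - 1, .tail = 1 };

	printf("%" PRIu32 " ready CQEs\n", mock_cq_events(&cq));
	return 0;
}
```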
@@ -589,7 +580,7 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 	__io_cqring_overflow_flush(ctx, true);
 }
 
-static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
+void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
 	__io_cqring_overflow_flush(ctx, false);
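Dropping `static` here exports `io_cqring_do_overflow_flush()` so the relocated wait code can keep calling it. Note the shape: the double-underscore `__io_cqring_overflow_flush()` assumes `uring_lock` is already held, while this wrapper acquires it. A minimal sketch of that naming convention, with pthreads standing in for the kernel mutex and all names hypothetical:

```c
#include <pthread.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static int overflow_entries;

/* __-prefixed variant: caller must already hold ctx_lock. */
static void __flush_overflow(void)
{
	overflow_entries = 0;
}

/* Public wrapper: acquires the lock, delegates, releases. */
void flush_overflow(void)
{
	pthread_mutex_lock(&ctx_lock);
	__flush_overflow();
	pthread_mutex_unlock(&ctx_lock);
}
```

The split lets paths that already hold the lock call the `__` variant directly without recursive locking, while external callers get the safe wrapper.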
@@ -1161,13 +1152,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	ctx->submit_state.cq_flush = false;
 }
 
-static unsigned io_cqring_events(struct io_ring_ctx *ctx)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return __io_cqring_events(ctx);
-}
-
 /*
  * We can't just wait for polled events to come to us, we have to actively
  * find and complete them.
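The removed `io_cqring_events()` issues `smp_rmb()` so reads of ring state cannot be reordered before the read of the producer-updated indices; the posting side pairs this with a write barrier before publishing the new tail. In portable C11 the same pairing is usually expressed as a release store and an acquire load on the tail index. A hypothetical single-producer/single-consumer sketch of that pattern, not the kernel's actual implementation:

```c
#include <stdatomic.h>
#include <stdint.h>

struct mock_cqe { uint64_t user_data; int32_t res; };

struct mock_ring {
	_Atomic uint32_t tail;		/* index published by the producer */
	uint32_t head;			/* single consumer, plain variable */
	struct mock_cqe cqes[256];	/* power-of-two sized ring */
};

/* Producer: fill the slot first, then publish with a release store.
 * The release store plays the role of the kernel's write barrier
 * before the tail update. Assumes a single producer. */
static void mock_post(struct mock_ring *r, struct mock_cqe cqe)
{
	uint32_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

	r->cqes[tail & 255] = cqe;
	atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
}

/* Consumer: an acquire load of tail (the smp_rmb() counterpart)
 * guarantees the CQE read below sees the fully written entry. */
static int mock_reap(struct mock_ring *r, struct mock_cqe *out)
{
	uint32_t tail = atomic_load_explicit(&r->tail, memory_order_acquire);

	if (tail == r->head)
		return 0;
	*out = r->cqes[r->head++ & 255];
	return 1;
}
```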
@@ -2060,308 +2044,6 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	return ret;
 }
 
-static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
-			    int wake_flags, void *key)
-{
-	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
-
-	/*
-	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
-	 * the task, and the next invocation will do it.
-	 */
-	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
-		return autoremove_wake_function(curr, mode, wake_flags, key);
-	return -1;
-}
-
-int io_run_task_work_sig(struct io_ring_ctx *ctx)
-{
-	if (io_local_work_pending(ctx)) {
-		__set_current_state(TASK_RUNNING);
-		if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0)
-			return 0;
-	}
-	if (io_run_task_work() > 0)
-		return 0;
-	if (task_sigpending(current))
-		return -EINTR;
-	return 0;
-}
-
-static bool current_pending_io(void)
-{
-	struct io_uring_task *tctx = current->io_uring;
-
-	if (!tctx)
-		return false;
-	return percpu_counter_read_positive(&tctx->inflight);
-}
-
-static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
-{
-	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
-
-	WRITE_ONCE(iowq->hit_timeout, 1);
-	iowq->min_timeout = 0;
-	wake_up_process(iowq->wq.private);
-	return HRTIMER_NORESTART;
-}
-
-/*
- * Doing min_timeout portion. If we saw any timeouts, events, or have work,
- * wake up. If not, and we have a normal timeout, switch to that and keep
- * sleeping.
- */
-static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
-{
-	struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t);
-	struct io_ring_ctx *ctx = iowq->ctx;
-
-	/* no general timeout, or shorter (or equal), we are done */
-	if (iowq->timeout == KTIME_MAX ||
-	    ktime_compare(iowq->min_timeout, iowq->timeout) >= 0)
-		goto out_wake;
-	/* work we may need to run, wake function will see if we need to wake */
-	if (io_has_work(ctx))
-		goto out_wake;
-	/* got events since we started waiting, min timeout is done */
-	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
-		goto out_wake;
-	/* if we have any events and min timeout expired, we're done */
-	if (io_cqring_events(ctx))
-		goto out_wake;
-
-	/*
-	 * If using deferred task_work running and application is waiting on
-	 * more than one request, ensure we reset it now where we are switching
-	 * to normal sleeps. Any request completion post min_wait should wake
-	 * the task and return.
-	 */
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 1);
-		smp_mb();
-		if (!llist_empty(&ctx->work_llist))
-			goto out_wake;
-	}
-
-	/* any generated CQE posted past this time should wake us up */
-	iowq->cq_tail = iowq->cq_min_tail;
-
-	hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
-	hrtimer_set_expires(timer, iowq->timeout);
-	return HRTIMER_RESTART;
-out_wake:
-	return io_cqring_timer_wakeup(timer);
-}
-
-static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
-				      clockid_t clock_id, ktime_t start_time)
-{
-	ktime_t timeout;
-
-	if (iowq->min_timeout) {
-		timeout = ktime_add_ns(iowq->min_timeout, start_time);
-		hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id,
-				       HRTIMER_MODE_ABS);
-	} else {
-		timeout = iowq->timeout;
-		hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id,
-				       HRTIMER_MODE_ABS);
-	}
-
-	hrtimer_set_expires_range_ns(&iowq->t, timeout, 0);
-	hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS);
-
-	if (!READ_ONCE(iowq->hit_timeout))
-		schedule();
-
-	hrtimer_cancel(&iowq->t);
-	destroy_hrtimer_on_stack(&iowq->t);
-	__set_current_state(TASK_RUNNING);
-
-	return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
-}
-
-struct ext_arg {
-	size_t argsz;
-	struct timespec64 ts;
-	const sigset_t __user *sig;
-	ktime_t min_time;
-	bool ts_set;
-	bool iowait;
-};
-
-static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-				     struct io_wait_queue *iowq,
-				     struct ext_arg *ext_arg,
-				     ktime_t start_time)
-{
-	int ret = 0;
-
-	/*
-	 * Mark us as being in io_wait if we have pending requests, so cpufreq
-	 * can take into account that the task is waiting for IO - turns out
-	 * to be important for low QD IO.
-	 */
-	if (ext_arg->iowait && current_pending_io())
-		current->in_iowait = 1;
-	if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
-		ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
-	else
-		schedule();
-	current->in_iowait = 0;
-	return ret;
-}
-
-/* If this returns > 0, the caller should retry */
-static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq,
-					  struct ext_arg *ext_arg,
-					  ktime_t start_time)
-{
-	if (unlikely(READ_ONCE(ctx->check_cq)))
-		return 1;
-	if (unlikely(io_local_work_pending(ctx)))
-		return 1;
-	if (unlikely(task_work_pending(current)))
-		return 1;
-	if (unlikely(task_sigpending(current)))
-		return -EINTR;
-	if (unlikely(io_should_wake(iowq)))
-		return 0;
-
-	return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
-}
-
-/*
- * Wait until events become available, if we don't already have some. The
- * application must reap them itself, as they reside on the shared cq ring.
- */
-static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
-			  struct ext_arg *ext_arg)
-{
-	struct io_wait_queue iowq;
-	struct io_rings *rings = ctx->rings;
-	ktime_t start_time;
-	int ret;
-
-	min_events = min_t(int, min_events, ctx->cq_entries);
-
-	if (!io_allowed_run_tw(ctx))
-		return -EEXIST;
-	if (io_local_work_pending(ctx))
-		io_run_local_work(ctx, min_events,
-				  max(IO_LOCAL_TW_DEFAULT_MAX, min_events));
-	io_run_task_work();
-
-	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
-		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
-		return 0;
-
-	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
-	iowq.wq.private = current;
-	INIT_LIST_HEAD(&iowq.wq.entry);
-	iowq.ctx = ctx;
-	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
-	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	iowq.hit_timeout = 0;
-	iowq.min_timeout = ext_arg->min_time;
-	iowq.timeout = KTIME_MAX;
-	start_time = io_get_time(ctx);
-
-	if (ext_arg->ts_set) {
-		iowq.timeout = timespec64_to_ktime(ext_arg->ts);
-		if (!(flags & IORING_ENTER_ABS_TIMER))
-			iowq.timeout = ktime_add(iowq.timeout, start_time);
-	}
-
-	if (ext_arg->sig) {
-#ifdef CONFIG_COMPAT
-		if (in_compat_syscall())
-			ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig,
-						      ext_arg->argsz);
-		else
-#endif
-			ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz);
-
-		if (ret)
-			return ret;
-	}
-
-	io_napi_busy_loop(ctx, &iowq);
-
-	trace_io_uring_cqring_wait(ctx, min_events);
-	do {
-		unsigned long check_cq;
-		int nr_wait;
-
-		/* if min timeout has been hit, don't reset wait count */
-		if (!iowq.hit_timeout)
-			nr_wait = (int) iowq.cq_tail -
-					READ_ONCE(ctx->rings->cq.tail);
-		else
-			nr_wait = 1;
-
-		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-			atomic_set(&ctx->cq_wait_nr, nr_wait);
-			set_current_state(TASK_INTERRUPTIBLE);
-		} else {
-			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
-						  TASK_INTERRUPTIBLE);
-		}
-
-		ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
-		__set_current_state(TASK_RUNNING);
-		atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
-
-		/*
-		 * Run task_work after scheduling and before io_should_wake().
-		 * If we got woken because of task_work being processed, run it
-		 * now rather than let the caller do another wait loop.
-		 */
-		if (io_local_work_pending(ctx))
-			io_run_local_work(ctx, nr_wait, nr_wait);
-		io_run_task_work();
-
-		/*
-		 * Non-local task_work will be run on exit to userspace, but
-		 * if we're using DEFER_TASKRUN, then we could have waited
-		 * with a timeout for a number of requests. If the timeout
-		 * hits, we could have some requests ready to process. Ensure
-		 * this break is _after_ we have run task_work, to avoid
-		 * deferring running potentially pending requests until the
-		 * next time we wait for events.
-		 */
-		if (ret < 0)
-			break;
-
-		check_cq = READ_ONCE(ctx->check_cq);
-		if (unlikely(check_cq)) {
-			/* let the caller flush overflows, retry */
-			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
-				io_cqring_do_overflow_flush(ctx);
-			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
-				ret = -EBADR;
-				break;
-			}
-		}
-
-		if (io_should_wake(&iowq)) {
-			ret = 0;
-			break;
-		}
-		cond_resched();
-	} while (1);
-
-	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
-		finish_wait(&ctx->cq_wait, &iowq.wq);
-	restore_saved_sigmask_unless(ret == -EINTR);
-
-	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
-}
-
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	io_free_region(ctx->user, &ctx->sq_region);
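Everything removed in this hunk — the wake callback, the hrtimer plumbing, and `io_cqring_wait()` itself — presumably lands in a new wait.c paired with the wait.h included above. The core of `io_cqring_wait()` is the classic lost-wakeup-safe loop: register on the wait queue (or set `cq_wait_nr`) first, re-check the wake condition via `io_cqring_wait_schedule()`, and only then sleep. A condensed userspace analogue of that shape, with hypothetical names and a pthread condition variable standing in for the kernel waitqueue:

```c
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cq_wait = PTHREAD_COND_INITIALIZER;
static uint32_t cq_head, cq_tail;

/* Wait until at least min_events completions are available. The
 * condition is re-checked after every wakeup, mirroring how
 * io_cqring_wait() loops until io_should_wake() is true. */
static void wait_cq_events(uint32_t min_events)
{
	pthread_mutex_lock(&lock);
	while (cq_tail - cq_head < min_events)
		pthread_cond_wait(&cq_wait, &lock);
	pthread_mutex_unlock(&lock);
}

/* Producer: publish a completion, then wake any waiter. */
static void post_cq_event(void)
{
	pthread_mutex_lock(&lock);
	cq_tail++;
	pthread_cond_signal(&cq_wait);
	pthread_mutex_unlock(&lock);
}
```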
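The subtlest piece being moved is the two-phase min_timeout wait: the hrtimer first fires at min_timeout, and `io_cqring_min_timer_wakeup()` decides whether to wake the task or silently re-arm the timer out to the full timeout, after which any single new CQE wakes the waiter (that is what resetting `iowq->cq_tail = iowq->cq_min_tail` accomplishes). A sketch of just that decision, modeled on the removed function but with hypothetical types and names:

```c
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical wait state, loosely mirroring struct io_wait_queue. */
struct two_phase_wait {
	uint64_t min_timeout_ns;	/* phase 1: wait for "enough" events */
	uint64_t timeout_ns;		/* phase 2 cap; UINT64_MAX if unset */
	uint32_t cq_tail_at_start;	/* ring tail sampled before sleeping */
};

enum timer_action { WAKE_NOW, REARM_TO_FULL_TIMEOUT };

/* Decision taken when the min-timeout timer fires: if anything has
 * happened, or there is no longer a second phase to fall back to, wake;
 * otherwise keep sleeping until the full timeout, from now on woken
 * by any new completion at all. */
static enum timer_action min_timer_fired(const struct two_phase_wait *w,
					 uint32_t cq_tail_now,
					 bool work_pending)
{
	if (w->timeout_ns == UINT64_MAX || w->min_timeout_ns >= w->timeout_ns)
		return WAKE_NOW;	/* min timeout IS the timeout */
	if (work_pending)
		return WAKE_NOW;	/* pending work may post CQEs */
	if (cq_tail_now != w->cq_tail_at_start)
		return WAKE_NOW;	/* events arrived during phase 1 */
	return REARM_TO_FULL_TIMEOUT;	/* nothing yet: enter phase 2 */
}
```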