// SPDX-License-Identifier: GPL-2.0
/*
 * Functions to sequence PREFLUSH and FUA writes.
 *
 * Copyright (C) 2011	Max Planck Institute for Gravitational Physics
 * Copyright (C) 2011	Tejun Heo <tj@kernel.org>
 *
 * REQ_{PREFLUSH|FUA} requests are decomposed into sequences consisting of
 * three optional steps - PREFLUSH, DATA and POSTFLUSH - according to the
 * request properties and hardware capability.
 *
 * If a request doesn't have data, only REQ_PREFLUSH makes sense, which
 * indicates a simple flush request.  If there is data, REQ_PREFLUSH indicates
 * that the device cache should be flushed before the data is executed, and
 * REQ_FUA means that the data must be on non-volatile media on request
 * completion.
 *
 * If the device doesn't have a writeback cache, PREFLUSH and FUA don't make
 * any difference.  The requests are either completed immediately if there's
 * no data or executed as normal requests otherwise.
 *
 * If the device has a writeback cache and supports FUA, REQ_PREFLUSH is
 * translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
 *
 * If the device has a writeback cache and doesn't support FUA, REQ_PREFLUSH
 * is translated to PREFLUSH and REQ_FUA to POSTFLUSH.
 *
 * The actual execution of flush is double buffered.  Whenever a request
 * needs to execute PRE or POSTFLUSH, it queues at
 * fq->flush_queue[fq->flush_pending_idx].  Once certain criteria are met, a
 * REQ_OP_FLUSH is issued and the pending_idx is toggled.  When the flush
 * completes, all the requests which were pending proceed to the next
 * step.  This allows arbitrary merging of different types of PREFLUSH/FUA
 * requests.
 *
 * Currently, the following conditions are used to determine when to issue
 * a flush.
 *
 * C1. At any given time, only one flush shall be in progress.  This makes
 *     double buffering sufficient.
 *
 * C2. Flush is deferred if any request is executing DATA of its sequence.
 *     This avoids issuing separate POSTFLUSHes for requests which shared
 *     a PREFLUSH.
 *
 * C3. The second condition is ignored if there is a request which has
 *     waited longer than FLUSH_PENDING_TIMEOUT.  This is to avoid
 *     starvation in the unlikely case where there is a continuous stream
 *     of FUA (without PREFLUSH) requests.
 *
 * For devices which support FUA, it isn't clear whether C2 (and thus C3)
 * is beneficial.
 *
 * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice.
 * Once while executing DATA and again after the whole sequence is
 * complete.  The first completion updates the contained bio but doesn't
 * finish it so that the bio submitter is notified only after the whole
 * sequence is complete.  This is implemented by testing RQF_FLUSH_SEQ in
 * req_bio_endio().
 *
 * The above peculiarity requires that each PREFLUSH/FUA request has only one
 * bio attached to it, which is guaranteed as they aren't allowed to be
 * merged in the usual way.
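 *
 * As an illustration derived from the rules above: a REQ_PREFLUSH|REQ_FUA
 * write with data on a writeback-cache device without FUA is sequenced as
 * PREFLUSH -> DATA -> POSTFLUSH; on a writeback-cache device with FUA it
 * becomes PREFLUSH -> DATA with REQ_FUA passed through to the driver; and
 * on a write-through device it typically runs as a plain write with both
 * flags stripped.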
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include <linux/lockdep.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"

/* PREFLUSH/FUA sequences */
enum {
	REQ_FSEQ_PREFLUSH	= (1 << 0), /* pre-flushing in progress */
	REQ_FSEQ_DATA		= (1 << 1), /* data write in progress */
	REQ_FSEQ_POSTFLUSH	= (1 << 2), /* post-flushing in progress */
	REQ_FSEQ_DONE		= (1 << 3),

	REQ_FSEQ_ACTIONS	= REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
				  REQ_FSEQ_POSTFLUSH,

	/*
	 * If flush has been pending longer than the following timeout,
	 * it's issued even if flush_data requests are still in flight.
	 */
	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
};

static void blk_kick_flush(struct request_queue *q,
			   struct blk_flush_queue *fq, unsigned int flags);

static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;

	if (fflags & (1UL << QUEUE_FLAG_WC)) {
		if (rq->cmd_flags & REQ_PREFLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
		    (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

static unsigned int blk_flush_cur_seq(struct request *rq)
{
	return 1 << ffz(rq->flush.seq);
}

static void blk_flush_restore_request(struct request *rq)
{
	/*
	 * After flush data completion, @rq->bio is %NULL but we need to
	 * complete the bio again.  @rq->biotail is guaranteed to equal the
	 * original @rq->bio.  Restore it.
	 */
	rq->bio = rq->biotail;

	/* make @rq a normal request */
	rq->rq_flags &= ~RQF_FLUSH_SEQ;
	rq->end_io = rq->flush.saved_end_io;
}

static void blk_flush_queue_rq(struct request *rq, bool add_front)
{
	blk_mq_add_to_requeue_list(rq, add_front, true);
}

/**
 * blk_flush_complete_seq - complete flush sequence
 * @rq: PREFLUSH/FUA request being sequenced
 * @fq: flush queue
 * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
 * @error: whether an error occurred
 *
 * @rq just completed @seq part of its flush sequence, record the
 * completion and trigger the next step.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
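 *
 * The next step is the lowest-numbered REQ_FSEQ_* bit still clear in
 * @rq->flush.seq (see blk_flush_cur_seq()), so each request walks
 * PREFLUSH -> DATA -> POSTFLUSH -> DONE, with the steps its policy never
 * required pre-marked as completed when it entered the flush machinery.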
 */
static void blk_flush_complete_seq(struct request *rq,
				   struct blk_flush_queue *fq,
				   unsigned int seq, blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	unsigned int cmd_flags;

	BUG_ON(rq->flush.seq & seq);
	rq->flush.seq |= seq;
	cmd_flags = rq->cmd_flags;

	if (likely(!error))
		seq = blk_flush_cur_seq(rq);
	else
		seq = REQ_FSEQ_DONE;

	switch (seq) {
	case REQ_FSEQ_PREFLUSH:
	case REQ_FSEQ_POSTFLUSH:
		/* queue for flush */
		if (list_empty(pending))
			fq->flush_pending_since = jiffies;
		list_move_tail(&rq->flush.list, pending);
		break;

	case REQ_FSEQ_DATA:
		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
		blk_flush_queue_rq(rq, true);
		break;

	case REQ_FSEQ_DONE:
		/*
		 * @rq was previously adjusted by blk_insert_flush() for
		 * flush sequencing and may already have gone through the
		 * flush data request completion path.  Restore @rq for
		 * normal completion and end it.
		 */
		BUG_ON(!list_empty(&rq->queuelist));
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
		blk_mq_end_request(rq, error);
		break;

	default:
		BUG();
	}

	blk_kick_flush(q, fq, cmd_flags);
}

static void flush_end_io(struct request *flush_rq, blk_status_t error)
{
	struct request_queue *q = flush_rq->q;
	struct list_head *running;
	struct request *rq, *n;
	unsigned long flags = 0;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
	struct blk_mq_hw_ctx *hctx;

	/* release the tag's ownership to the req cloned from */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);

	if (!refcount_dec_and_test(&flush_rq->ref)) {
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
		return;
	}

	if (fq->rq_status != BLK_STS_OK)
		error = fq->rq_status;

	hctx = flush_rq->mq_hctx;
	if (!q->elevator) {
		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
		flush_rq->tag = -1;
	} else {
		blk_mq_put_driver_tag(flush_rq);
		flush_rq->internal_tag = -1;
	}

	running = &fq->flush_queue[fq->flush_running_idx];
	BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);

	/* account completion of the flush request */
	fq->flush_running_idx ^= 1;

	/* and push the waiting requests to the next stage */
	list_for_each_entry_safe(rq, n, running, flush.list) {
		unsigned int seq = blk_flush_cur_seq(rq);

		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
		blk_flush_complete_seq(rq, fq, seq, error);
	}

	fq->flush_queue_delayed = 0;
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}

/**
 * blk_kick_flush - consider issuing flush request
 * @q: request_queue being kicked
 * @fq: flush queue
 * @flags: cmd_flags of the original request
 *
 * Flush related states of @q have changed, consider issuing a flush request.
 * Please read the comment at the top of this file for more info.
 *
 * CONTEXT:
 * spin_lock_irq(fq->mq_flush_lock)
 *
 */
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
			   unsigned int flags)
{
	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
	struct request *first_rq =
		list_first_entry(pending, struct request, flush.list);
	struct request *flush_rq = fq->flush_rq;

	/* C1 described at the top of this file */
	if (fq->flush_pending_idx != fq->flush_running_idx ||
	    list_empty(pending))
		return;

	/* C2 and C3
	 *
	 * For blk-mq + scheduling, we can risk having all driver tags
	 * assigned to empty flushes, and we deadlock if we are expecting
	 * other requests to make progress.  Don't defer for that case.
	 */
	if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
	    time_before(jiffies,
			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
		return;

	/*
	 * Issue flush and toggle pending_idx.  This makes pending_idx
	 * different from running_idx, which means flush is in flight.
	 */
	fq->flush_pending_idx ^= 1;

	blk_rq_init(q, flush_rq);

	/*
	 * In case of the "none" scheduler, borrow the tag from the first
	 * request since they can't be in flight at the same time, and
	 * acquire the tag's ownership for the flush req.
	 *
	 * In case of an IO scheduler, the flush rq needs to borrow the
	 * scheduler tag just so that put/get driver tag work on it.
	 */
	flush_rq->mq_ctx = first_rq->mq_ctx;
	flush_rq->mq_hctx = first_rq->mq_hctx;

	if (!q->elevator) {
		fq->orig_rq = first_rq;
		flush_rq->tag = first_rq->tag;
		blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
	} else {
		flush_rq->internal_tag = first_rq->internal_tag;
	}

	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
	flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
	flush_rq->rq_flags |= RQF_FLUSH_SEQ;
	flush_rq->rq_disk = first_rq->rq_disk;
	flush_rq->end_io = flush_end_io;

	blk_flush_queue_rq(flush_rq, false);
}

static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	unsigned long flags;
	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);

	if (q->elevator) {
		WARN_ON(rq->tag < 0);
		blk_mq_put_driver_tag(rq);
	}

	/*
	 * After populating an empty queue, kick it to avoid stall.  Read
	 * the comment in flush_end_io().
	 */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);
	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);

	blk_mq_sched_restart(hctx);
}

/**
 * blk_insert_flush - insert a new PREFLUSH/FUA request
 * @rq: request to insert
 *
 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH
 * insertions, or from __blk_mq_run_hw_queue() to dispatch the request.
 * @rq is being submitted.  Analyze what needs to be done and put it on the
 * right queue.
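 *
 * Depending on the policy computed by blk_flush_policy(), @rq is either
 * ended immediately (an empty flush on a device without a writeback cache),
 * bypass-inserted for normal execution (data but no flush steps required),
 * or pulled into the flush state machine via blk_flush_complete_seq() with
 * the steps it doesn't need pre-marked as done.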
 */
void blk_insert_flush(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned long fflags = q->queue_flags;	/* may change, cache */
	unsigned int policy = blk_flush_policy(fflags, rq);
	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);

	/*
	 * @policy now records what operations need to be done.  Adjust
	 * REQ_PREFLUSH and FUA for the driver.
	 */
	rq->cmd_flags &= ~REQ_PREFLUSH;
	if (!(fflags & (1UL << QUEUE_FLAG_FUA)))
		rq->cmd_flags &= ~REQ_FUA;

	/*
	 * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any
	 * of those flags, we have to set REQ_SYNC to avoid skewing
	 * the request accounting.
	 */
	rq->cmd_flags |= REQ_SYNC;

	/*
	 * An empty flush handed down from a stacking driver may
	 * translate into nothing if the underlying device does not
	 * advertise a write-back cache.  In this case, simply
	 * complete the request.
	 */
	if (!policy) {
		blk_mq_end_request(rq, 0);
		return;
	}

	BUG_ON(rq->bio != rq->biotail); /* assumes zero or single bio rq */

	/*
	 * If there's data but flush is not necessary, the request can be
	 * processed directly without going through flush machinery.  Queue
	 * for normal execution.
	 */
	if ((policy & REQ_FSEQ_DATA) &&
	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
		blk_mq_request_bypass_insert(rq, false, false);
		return;
	}

	/*
	 * @rq should go through flush machinery.  Mark it part of flush
	 * sequence and submit for further processing.
	 */
	memset(&rq->flush, 0, sizeof(rq->flush));
	INIT_LIST_HEAD(&rq->flush.list);
	rq->rq_flags |= RQF_FLUSH_SEQ;
	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */

	rq->end_io = mq_flush_data_end_io;

	spin_lock_irq(&fq->mq_flush_lock);
	blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
	spin_unlock_irq(&fq->mq_flush_lock);
}

/**
 * blkdev_issue_flush - queue a flush
 * @bdev:	blockdev to issue flush for
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @error_sector:	error sector
 *
 * Description:
 *    Issue a flush for the block device in question.  Caller can supply
 *    room for storing the error offset in case of a flush error, if they
 *    wish to.
 */
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
		       sector_t *error_sector)
{
	struct request_queue *q;
	struct bio *bio;
	int ret = 0;

	if (bdev->bd_disk == NULL)
		return -ENXIO;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	/*
	 * Some block devices may not have their queue correctly set up here
	 * (e.g. a loop device without a backing file) and so issuing a flush
	 * here will panic.  Ensure there is a request function before issuing
	 * the flush.
	 */
	if (!q->make_request_fn)
		return -ENXIO;

	bio = bio_alloc(gfp_mask, 0);
	bio_set_dev(bio, bdev);
	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

	ret = submit_bio_wait(bio);

	/*
	 * The driver must store the error location in ->bi_sector, if
	 * it supports it.  For non-stacked drivers, this should be
	 * copied from blk_rq_pos(rq).
	 */
	if (error_sector)
		*error_sector = bio->bi_iter.bi_sector;

	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);

struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
		int node, int cmd_size, gfp_t flags)
{
	struct blk_flush_queue *fq;
	int rq_sz = sizeof(struct request);

	fq = kzalloc_node(sizeof(*fq), flags, node);
	if (!fq)
		goto fail;

	spin_lock_init(&fq->mq_flush_lock);

	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
	if (!fq->flush_rq)
		goto fail_rq;

	INIT_LIST_HEAD(&fq->flush_queue[0]);
	INIT_LIST_HEAD(&fq->flush_queue[1]);
	INIT_LIST_HEAD(&fq->flush_data_in_flight);

	lockdep_register_key(&fq->key);
	lockdep_set_class(&fq->mq_flush_lock, &fq->key);

	return fq;

 fail_rq:
	kfree(fq);
 fail:
	return NULL;
}

void blk_free_flush_queue(struct blk_flush_queue *fq)
{
	/* a bio based request queue has no flush queue */
	if (!fq)
		return;

	lockdep_unregister_key(&fq->key);
	kfree(fq->flush_rq);
	kfree(fq);
}
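
/*
 * Note: the PREFLUSH/FUA policy above is keyed off QUEUE_FLAG_WC and
 * QUEUE_FLAG_FUA.  Drivers typically declare those capabilities with
 * blk_queue_write_cache(), e.g. blk_queue_write_cache(q, true, true) for a
 * volatile write cache with FUA support, (true, false) for a cache without
 * FUA (REQ_FUA then becomes a POSTFLUSH step), and (false, false) for a
 * write-through device, in which case blk_flush_policy() drops the
 * PREFLUSH/POSTFLUSH steps entirely.
 */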