@@ -1006,24 +1006,28 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
10061006 * fls() instead since we need to know the actual length while modifying
10071007 * goal length.
10081008 */
1009- order = fls (ac -> ac_g_ex .fe_len );
1009+ order = fls (ac -> ac_g_ex .fe_len ) - 1 ;
10101010 min_order = order - sbi -> s_mb_best_avail_max_trim_order ;
10111011 if (min_order < 0 )
10121012 min_order = 0 ;
10131013
1014- if (1 << min_order < ac -> ac_o_ex .fe_len )
1015- min_order = fls (ac -> ac_o_ex .fe_len ) + 1 ;
1016-
10171014 if (sbi -> s_stripe > 0 ) {
10181015 /*
10191016 * We are assuming that stripe size is always a multiple of
10201017 * cluster ratio otherwise __ext4_fill_super exits early.
10211018 */
10221019 num_stripe_clusters = EXT4_NUM_B2C (sbi , sbi -> s_stripe );
10231020 if (1 << min_order < num_stripe_clusters )
1024- min_order = fls (num_stripe_clusters );
1021+ /*
1022+ * We consider 1 order less because later we round
1023+ * up the goal len to num_stripe_clusters
1024+ */
1025+ min_order = fls (num_stripe_clusters ) - 1 ;
10251026 }
10261027
1028+ if (1 << min_order < ac -> ac_o_ex .fe_len )
1029+ min_order = fls (ac -> ac_o_ex .fe_len );
1030+
10271031 for (i = order ; i >= min_order ; i -- ) {
10281032 int frag_order ;
10291033 /*
@@ -4761,56 +4765,160 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
47614765 int order , i ;
47624766 struct ext4_inode_info * ei = EXT4_I (ac -> ac_inode );
47634767 struct ext4_locality_group * lg ;
4764- struct ext4_prealloc_space * tmp_pa , * cpa = NULL ;
4765- ext4_lblk_t tmp_pa_start , tmp_pa_end ;
4768+ struct ext4_prealloc_space * tmp_pa = NULL , * cpa = NULL ;
4769+ loff_t tmp_pa_end ;
47664770 struct rb_node * iter ;
47674771 ext4_fsblk_t goal_block ;
47684772
47694773 /* only data can be preallocated */
47704774 if (!(ac -> ac_flags & EXT4_MB_HINT_DATA ))
47714775 return false;
47724776
4773- /* first, try per-file preallocation */
4777+ /*
4778+ * first, try per-file preallocation by searching the inode pa rbtree.
4779+ *
4780+ * Here, we can't do a direct traversal of the tree because
4781+ * ext4_mb_discard_group_preallocation() can concurrently mark the pa
4782+ * deleted and that can cause direct traversal to skip some entries.
4783+ */
47744784 read_lock (& ei -> i_prealloc_lock );
4785+
4786+ if (RB_EMPTY_ROOT (& ei -> i_prealloc_node )) {
4787+ goto try_group_pa ;
4788+ }
4789+
4790+ /*
4791+ * Step 1: Find a pa with logical start immediately adjacent to the
4792+ * original logical start. This could be on the left or right.
4793+ *
4794+ * (tmp_pa->pa_lstart never changes so we can skip locking for it).
4795+ */
47754796 for (iter = ei -> i_prealloc_node .rb_node ; iter ;
47764797 iter = ext4_mb_pa_rb_next_iter (ac -> ac_o_ex .fe_logical ,
4777- tmp_pa_start , iter )) {
4798+ tmp_pa -> pa_lstart , iter )) {
47784799 tmp_pa = rb_entry (iter , struct ext4_prealloc_space ,
47794800 pa_node .inode_node );
4801+ }
47804802
4781- /* all fields in this condition don't change,
4782- * so we can skip locking for them */
4783- tmp_pa_start = tmp_pa -> pa_lstart ;
4784- tmp_pa_end = tmp_pa -> pa_lstart + EXT4_C2B (sbi , tmp_pa -> pa_len );
4785-
4786- /* original request start doesn't lie in this PA */
4787- if (ac -> ac_o_ex .fe_logical < tmp_pa_start ||
4788- ac -> ac_o_ex .fe_logical >= tmp_pa_end )
4789- continue ;
4803+ /*
4804+ * Step 2: The adjacent pa might be to the right of logical start, find
4805+ * the left adjacent pa. After this step we'd have a valid tmp_pa whose
4806+ * logical start is towards the left of original request's logical start
4807+ */
4808+ if (tmp_pa -> pa_lstart > ac -> ac_o_ex .fe_logical ) {
4809+ struct rb_node * tmp ;
4810+ tmp = rb_prev (& tmp_pa -> pa_node .inode_node );
47904811
4791- /* non-extent files can't have physical blocks past 2^32 */
4792- if (!( ext4_test_inode_flag ( ac -> ac_inode , EXT4_INODE_EXTENTS )) &&
4793- ( tmp_pa -> pa_pstart + EXT4_C2B ( sbi , tmp_pa -> pa_len ) >
4794- EXT4_MAX_BLOCK_FILE_PHYS )) {
4812+ if ( tmp ) {
4813+ tmp_pa = rb_entry ( tmp , struct ext4_prealloc_space ,
4814+ pa_node . inode_node );
4815+ } else {
47954816 /*
4796- * Since PAs don't overlap, we won't find any
4797- * other PA to satisfy this.
4817+ * If there is no adjacent pa to the left then finding
4818+ * an overlapping pa is not possible, hence stop searching
4819+ * the inode pa tree
47984820 */
4799- break ;
4821+ goto try_group_pa ;
48004822 }
4823+ }
4824+
4825+ BUG_ON (!(tmp_pa && tmp_pa -> pa_lstart <= ac -> ac_o_ex .fe_logical ));
48014826
4802- /* found preallocated blocks, use them */
4827+ /*
4828+ * Step 3: If the left adjacent pa is deleted, keep moving left to find
4829+ * the first non deleted adjacent pa. After this step we should have a
4830+ * valid tmp_pa which is guaranteed to be non deleted.
4831+ */
4832+ for (iter = & tmp_pa -> pa_node .inode_node ;; iter = rb_prev (iter )) {
4833+ if (!iter ) {
4834+ /*
4835+ * no non deleted left adjacent pa, so stop searching
4836+ * inode pa tree
4837+ */
4838+ goto try_group_pa ;
4839+ }
4840+ tmp_pa = rb_entry (iter , struct ext4_prealloc_space ,
4841+ pa_node .inode_node );
48034842 spin_lock (& tmp_pa -> pa_lock );
4804- if (tmp_pa -> pa_deleted == 0 && tmp_pa -> pa_free &&
4805- likely (ext4_mb_pa_goal_check (ac , tmp_pa ))) {
4806- atomic_inc (& tmp_pa -> pa_count );
4807- ext4_mb_use_inode_pa (ac , tmp_pa );
4843+ if (tmp_pa -> pa_deleted == 0 ) {
4844+ /*
4845+ * We will keep holding the pa_lock from
4846+ * this point on because we don't want group discard
4847+ * to delete this pa underneath us. Since group
4848+ * discard is anyways an ENOSPC operation it
4849+ * should be okay for it to wait a few more cycles.
4850+ */
4851+ break ;
4852+ } else {
48084853 spin_unlock (& tmp_pa -> pa_lock );
4809- read_unlock (& ei -> i_prealloc_lock );
4810- return true;
48114854 }
4855+ }
4856+
4857+ BUG_ON (!(tmp_pa && tmp_pa -> pa_lstart <= ac -> ac_o_ex .fe_logical ));
4858+ BUG_ON (tmp_pa -> pa_deleted == 1 );
4859+
4860+ /*
4861+ * Step 4: We now have the non deleted left adjacent pa. Only this
4862+ * pa can possibly satisfy the request hence check if it overlaps
4863+ * original logical start and stop searching if it doesn't.
4864+ */
4865+ tmp_pa_end = (loff_t )tmp_pa -> pa_lstart + EXT4_C2B (sbi , tmp_pa -> pa_len );
4866+
4867+ if (ac -> ac_o_ex .fe_logical >= tmp_pa_end ) {
48124868 spin_unlock (& tmp_pa -> pa_lock );
4869+ goto try_group_pa ;
4870+ }
4871+
4872+ /* non-extent files can't have physical blocks past 2^32 */
4873+ if (!(ext4_test_inode_flag (ac -> ac_inode , EXT4_INODE_EXTENTS )) &&
4874+ (tmp_pa -> pa_pstart + EXT4_C2B (sbi , tmp_pa -> pa_len ) >
4875+ EXT4_MAX_BLOCK_FILE_PHYS )) {
4876+ /*
4877+ * Since PAs don't overlap, we won't find any other PA to
4878+ * satisfy this.
4879+ */
4880+ spin_unlock (& tmp_pa -> pa_lock );
4881+ goto try_group_pa ;
4882+ }
4883+
4884+ if (tmp_pa -> pa_free && likely (ext4_mb_pa_goal_check (ac , tmp_pa ))) {
4885+ atomic_inc (& tmp_pa -> pa_count );
4886+ ext4_mb_use_inode_pa (ac , tmp_pa );
4887+ spin_unlock (& tmp_pa -> pa_lock );
4888+ read_unlock (& ei -> i_prealloc_lock );
4889+ return true;
4890+ } else {
4891+ /*
4892+ * We found a valid overlapping pa but couldn't use it because
4893+ * it had no free blocks. This should ideally never happen
4894+ * because:
4895+ *
4896+ * 1. When a new inode pa is added to rbtree it must have
4897+ * pa_free > 0 since otherwise we won't actually need
4898+ * preallocation.
4899+ *
4900+ * 2. An inode pa that is in the rbtree can only have its
4901+ * pa_free become zero when another thread calls:
4902+ * ext4_mb_new_blocks
4903+ * ext4_mb_use_preallocated
4904+ * ext4_mb_use_inode_pa
4905+ *
4906+ * 3. Further, after the above calls make pa_free == 0, we will
4907+ * immediately remove it from the rbtree in:
4908+ * ext4_mb_new_blocks
4909+ * ext4_mb_release_context
4910+ * ext4_mb_put_pa
4911+ *
4912+ * 4. Since the pa_free becoming 0 and pa_free getting removed
4913+ * from the tree both happen in ext4_mb_new_blocks, which is always
4914+ * called with i_data_sem held for data allocations, we can be
4915+ * sure that another process will never see a pa in rbtree with
4916+ * pa_free == 0.
4917+ */
4918+ WARN_ON_ONCE (tmp_pa -> pa_free == 0 );
48134919 }
4920+ spin_unlock (& tmp_pa -> pa_lock );
4921+ try_group_pa :
48144922 read_unlock (& ei -> i_prealloc_lock );
48154923
48164924 /* can we use group allocation? */
0 commit comments