Skip to content

Commit 00a31dd

Browse files
npitre authored and arndb committed
asm-generic/div64: optimize/simplify __div64_const32()
Several years later I just realized that this code could be greatly simplified.

First, let's formalize the need for overflow handling in __arch_xprod_64(). Assuming n = UINT64_MAX, there are 2 cases where an overflow may occur:

1) If a bias must be added, we have m_lo * n_lo + m, or m_lo * 0xffffffff + ((m_hi << 32) + m_lo), or ((m_lo << 32) - m_lo) + ((m_hi << 32) + m_lo), or (m_lo + m_hi) << 32, which must be < (1 << 64). So the criterion for no overflow is m_lo + m_hi < (1 << 32).

2) The cross product m_lo * n_hi + m_hi * n_lo, or m_lo * 0xffffffff + m_hi * 0xffffffff, or ((m_lo << 32) - m_lo) + ((m_hi << 32) - m_hi). Assuming the top result from the previous step (m_lo + m_hi) that must be added to this, we get (m_lo + m_hi) << 32 again.

So let's have a straight and simpler version when this is true. Otherwise some reordering allows for taking care of possible overflows without any actual conditionals. To prevent both code variants from being generated, this is considered only if m is perceived as constant by the compiler.

This, in turn, allows for greatly simplifying __div64_const32(). The "special case" may go as well, since the regular case works just fine without needing a bias. Then reduction should be applied all the time, as minimizing m is the key.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
1 parent 1dc8267 commit 00a31dd

1 file changed

Lines changed: 35 additions & 79 deletions

File tree

include/asm-generic/div64.h

Lines changed: 35 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@
7474
* do the trick here). \
7575
*/ \
7676
uint64_t ___res, ___x, ___t, ___m, ___n = (n); \
77-
uint32_t ___p, ___bias; \
77+
uint32_t ___p; \
78+
bool ___bias = false; \
7879
\
7980
/* determine MSB of b */ \
8081
___p = 1 << ilog2(___b); \
@@ -87,22 +88,14 @@
8788
___x = ~0ULL / ___b * ___b - 1; \
8889
\
8990
/* test our ___m with res = m * x / (p << 64) */ \
90-
___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \
91-
___t = ___res += (___m & 0xffffffff) * (___x >> 32); \
92-
___res += (___x & 0xffffffff) * (___m >> 32); \
93-
___t = (___res < ___t) ? (1ULL << 32) : 0; \
94-
___res = (___res >> 32) + ___t; \
95-
___res += (___m >> 32) * (___x >> 32); \
96-
___res /= ___p; \
91+
___res = (___m & 0xffffffff) * (___x & 0xffffffff); \
92+
___t = (___m & 0xffffffff) * (___x >> 32) + (___res >> 32); \
93+
___res = (___m >> 32) * (___x >> 32) + (___t >> 32); \
94+
___t = (___m >> 32) * (___x & 0xffffffff) + (___t & 0xffffffff);\
95+
___res = (___res + (___t >> 32)) / ___p; \
9796
\
98-
/* Now sanitize and optimize what we've got. */ \
99-
if (~0ULL % (___b / (___b & -___b)) == 0) { \
100-
/* special case, can be simplified to ... */ \
101-
___n /= (___b & -___b); \
102-
___m = ~0ULL / (___b / (___b & -___b)); \
103-
___p = 1; \
104-
___bias = 1; \
105-
} else if (___res != ___x / ___b) { \
97+
/* Now validate what we've got. */ \
98+
if (___res != ___x / ___b) { \
10699
/* \
107100
* We can't get away without a bias to compensate \
108101
* for bit truncation errors. To avoid it we'd need an \
@@ -111,45 +104,18 @@
111104
* \
112105
* Instead we do m = p / b and n / b = (n * m + m) / p. \
113106
*/ \
114-
___bias = 1; \
107+
___bias = true; \
115108
/* Compute m = (p << 64) / b */ \
116109
___m = (~0ULL / ___b) * ___p; \
117110
___m += ((~0ULL % ___b + 1) * ___p) / ___b; \
118-
} else { \
119-
/* \
120-
* Reduce m / p, and try to clear bit 31 of m when \
121-
* possible, otherwise that'll need extra overflow \
122-
* handling later. \
123-
*/ \
124-
uint32_t ___bits = -(___m & -___m); \
125-
___bits |= ___m >> 32; \
126-
___bits = (~___bits) << 1; \
127-
/* \
128-
* If ___bits == 0 then setting bit 31 is unavoidable. \
129-
* Simply apply the maximum possible reduction in that \
130-
* case. Otherwise the MSB of ___bits indicates the \
131-
* best reduction we should apply. \
132-
*/ \
133-
if (!___bits) { \
134-
___p /= (___m & -___m); \
135-
___m /= (___m & -___m); \
136-
} else { \
137-
___p >>= ilog2(___bits); \
138-
___m >>= ilog2(___bits); \
139-
} \
140-
/* No bias needed. */ \
141-
___bias = 0; \
142111
} \
143112
\
113+
/* Reduce m / p to help avoid overflow handling later. */ \
114+
___p /= (___m & -___m); \
115+
___m /= (___m & -___m); \
116+
\
144117
/* \
145-
* Now we have a combination of 2 conditions: \
146-
* \
147-
* 1) whether or not we need to apply a bias, and \
148-
* \
149-
* 2) whether or not there might be an overflow in the cross \
150-
* product determined by (___m & ((1 << 63) | (1 << 31))). \
151-
* \
152-
* Select the best way to do (m_bias + m * n) / (1 << 64). \
118+
* Perform (m_bias + m * n) / (1 << 64). \
153119
* From now on there will be actual runtime code generated. \
154120
*/ \
155121
___res = __arch_xprod_64(___m, ___n, ___bias); \
@@ -165,7 +131,7 @@
165131
* Semantic: retval = ((bias ? m : 0) + m * n) >> 64
166132
*
167133
* The product is a 128-bit value, scaled down to 64 bits.
168-
* Assuming constant propagation to optimize away unused conditional code.
134+
* Hoping for compile-time optimization of conditional code.
169135
* Architectures may provide their own optimized assembly implementation.
170136
*/
171137
static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
@@ -174,38 +140,28 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
174140
uint32_t m_hi = m >> 32;
175141
uint32_t n_lo = n;
176142
uint32_t n_hi = n >> 32;
177-
uint64_t res;
178-
uint32_t res_lo, res_hi, tmp;
179-
180-
if (!bias) {
181-
res = ((uint64_t)m_lo * n_lo) >> 32;
182-
} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
183-
/* there can't be any overflow here */
184-
res = (m + (uint64_t)m_lo * n_lo) >> 32;
185-
} else {
186-
res = m + (uint64_t)m_lo * n_lo;
187-
res_lo = res >> 32;
188-
res_hi = (res_lo < m_hi);
189-
res = res_lo | ((uint64_t)res_hi << 32);
190-
}
191-
192-
if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
193-
/* there can't be any overflow here */
194-
res += (uint64_t)m_lo * n_hi;
195-
res += (uint64_t)m_hi * n_lo;
196-
res >>= 32;
143+
uint64_t x, y;
144+
145+
/* Determine if overflow handling can be dispensed with. */
146+
bool no_ovf = __builtin_constant_p(m) &&
147+
((m >> 32) + (m & 0xffffffff) < 0x100000000);
148+
149+
if (no_ovf) {
150+
x = (uint64_t)m_lo * n_lo + (bias ? m : 0);
151+
x >>= 32;
152+
x += (uint64_t)m_lo * n_hi;
153+
x += (uint64_t)m_hi * n_lo;
154+
x >>= 32;
155+
x += (uint64_t)m_hi * n_hi;
197156
} else {
198-
res += (uint64_t)m_lo * n_hi;
199-
tmp = res >> 32;
200-
res += (uint64_t)m_hi * n_lo;
201-
res_lo = res >> 32;
202-
res_hi = (res_lo < tmp);
203-
res = res_lo | ((uint64_t)res_hi << 32);
157+
x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0);
158+
y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0);
159+
x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
160+
y = (uint64_t)m_hi * n_lo + (uint32_t)y;
161+
x += (uint32_t)(y >> 32);
204162
}
205163

206-
res += (uint64_t)m_hi * n_hi;
207-
208-
return res;
164+
return x;
209165
}
210166
#endif
211167

0 commit comments

Comments
 (0)