include/linux/reciprocal_div.h - linux/torvalds/linux - Git at Google

 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef _LINUX_RECIPROCAL_DIV_H
 #define _LINUX_RECIPROCAL_DIV_H

 #include <linux/types.h>

 /*
  * This algorithm is based on the paper "Division by Invariant
  * Integers Using Multiplication" by Torbjörn Granlund and Peter
  * L. Montgomery.
  *
  * The assembler implementation from Agner Fog, which this code is
  * based on, can be found here:
  * http://www.agner.org/optimize/asmlib.zip
  *
  * This optimization for A/B is helpful if the divisor B is mostly
  * runtime invariant. The reciprocal of B is calculated in the
  * slow-path with reciprocal_value(). The fast-path can then just use
  * a much faster multiplication operation with a variable dividend A
  * to calculate the division A/B.
  */

 struct reciprocal_value {
 	u32 m;
 	u8 sh1, sh2;
 };

 /* "reciprocal_value" and "reciprocal_divide" together implement the basic
  * version of the algorithm described in Figure 4.1 of the paper.
  */
 struct reciprocal_value reciprocal_value(u32 d);

 static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 {
 	u32 t = (u32)(((u64)a * R.m) >> 32);
 	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }

 struct reciprocal_value_adv {
 	u32 m;
 	u8 sh, exp;
 	bool is_wide_m;
 };

 /* "reciprocal_value_adv" implements the advanced version of the algorithm
  * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
  * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
  * exception case could be easily handled before calling "reciprocal_value_adv".
  *
  * The advanced version requires more complex calculation to get the reciprocal
  * multiplier and other control variables, but then could reduce the required
  * emulation operations.
  *
  * It makes no sense to use this advanced version for host divide emulation,
  * those extra complexities for calculating multiplier etc could completely
  * waive our saving on emulation operations.
  *
  * However, it makes sense to use it for JIT divide code generation for which
  * we are willing to trade performance of JITed code with that of host. As shown
  * by the following pseudo code, the required emulation operations could go down
  * from 6 (the basic version) to 3 or 4.
  *
  * To use the result of "reciprocal_value_adv", suppose we want to calculate
  * n/d, the pseudo C code will be:
  *
  *   struct reciprocal_value_adv rvalue;
  *   u8 pre_shift, exp;
  *
  *   // handle exception case.
  *   if (d >= (1U << 31)) {
  *     result = n >= d;
  *     return;
  *   }
  *
  *   rvalue = reciprocal_value_adv(d, 32)
  *   exp = rvalue.exp;
  *   if (rvalue.is_wide_m && !(d & 1)) {
  *     // floor(log2(d & (2^32 -d)))
  *     pre_shift = fls(d & -d) - 1;
  *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
  *   } else {
  *     pre_shift = 0;
  *   }
  *
  *   // code generation starts.
  *   if (imm == 1U << exp) {
  *     result = n >> exp;
  *   } else if (rvalue.is_wide_m) {
  *     // pre_shift must be zero when reached here.
  *     t = (n * rvalue.m) >> 32;
  *     result = n - t;
  *     result >>= 1;
  *     result += t;
  *     result >>= rvalue.sh - 1;
  *   } else {
  *     if (pre_shift)
  *       result = n >> pre_shift;
  *     result = ((u64)result * rvalue.m) >> 32;
  *     result >>= rvalue.sh;
  *   }
  */
 struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);

 #endif /* _LINUX_RECIPROCAL_DIV_H */
	/* SPDX-License-Identifier: GPL-2.0 */
	#ifndef _LINUX_RECIPROCAL_DIV_H
	#define _LINUX_RECIPROCAL_DIV_H

	#include <linux/types.h>

	/*
	* This algorithm is based on the paper "Division by Invariant
	* Integers Using Multiplication" by Torbjörn Granlund and Peter
	* L. Montgomery.
	*
	* The assembler implementation from Agner Fog, which this code is
	* based on, can be found here:
	* http://www.agner.org/optimize/asmlib.zip
	*
	* This optimization for A/B is helpful if the divisor B is mostly
	* runtime invariant. The reciprocal of B is calculated in the
	* slow-path with reciprocal_value(). The fast-path can then just use
	* a much faster multiplication operation with a variable dividend A
	* to calculate the division A/B.
	*/

	struct reciprocal_value {
	u32 m;
	u8 sh1, sh2;
	};

	/* "reciprocal_value" and "reciprocal_divide" together implement the basic
	* version of the algorithm described in Figure 4.1 of the paper.
	*/
	struct reciprocal_value reciprocal_value(u32 d);

	static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
	{
	u32 t = (u32)(((u64)a * R.m) >> 32);
	return (t + ((a - t) >> R.sh1)) >> R.sh2;
	}

	struct reciprocal_value_adv {
	u32 m;
	u8 sh, exp;
	bool is_wide_m;
	};

	/* "reciprocal_value_adv" implements the advanced version of the algorithm
	* described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
	* ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
	* exception case could be easily handled before calling "reciprocal_value_adv".
	*
	* The advanced version requires more complex calculation to get the reciprocal
	* multiplier and other control variables, but then could reduce the required
	* emulation operations.
	*
	* It makes no sense to use this advanced version for host divide emulation,
	* those extra complexities for calculating multiplier etc could completely
	* waive our saving on emulation operations.
	*
	* However, it makes sense to use it for JIT divide code generation for which
	* we are willing to trade performance of JITed code with that of host. As shown
	* by the following pseudo code, the required emulation operations could go down
	* from 6 (the basic version) to 3 or 4.
	*
	* To use the result of "reciprocal_value_adv", suppose we want to calculate
	* n/d, the pseudo C code will be:
	*
	* struct reciprocal_value_adv rvalue;
	* u8 pre_shift, exp;
	*
	* // handle exception case.
	* if (d >= (1U << 31)) {
	* result = n >= d;
	* return;
	* }
	*
	* rvalue = reciprocal_value_adv(d, 32)
	* exp = rvalue.exp;
	* if (rvalue.is_wide_m && !(d & 1)) {
	* // floor(log2(d & (2^32 -d)))
	* pre_shift = fls(d & -d) - 1;
	* rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
	* } else {
	* pre_shift = 0;
	* }
	*
	* // code generation starts.
	* if (imm == 1U << exp) {
	* result = n >> exp;
	* } else if (rvalue.is_wide_m) {
	* // pre_shift must be zero when reached here.
	* t = (n * rvalue.m) >> 32;
	* result = n - t;
	* result >>= 1;
	* result += t;
	* result >>= rvalue.sh - 1;
	* } else {
	* if (pre_shift)
	* result = n >> pre_shift;
	* result = ((u64)result * rvalue.m) >> 32;
	* result >>= rvalue.sh;
	* }
	*/
	struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);

	#endif /* _LINUX_RECIPROCAL_DIV_H */