chiark - git - mdw - catacomb/blob - base/permute.h

   1 /* -*-c-*-
   2  *
   3  * Bit permutations
   4  *
   5  * (c) 2024 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software: you can redistribute it and/or modify it
  13  * under the terms of the GNU Library General Public License as published
  14  * by the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful, but
  18  * WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb.  If not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  25  * USA.
  26  */
  27
  28 #ifndef CATACOMB_PERMUTE_H
  29 #define CATACOMB_PERMUTE_H
  30
  31 #ifdef __cplusplus
  32   extern "C" {
  33 #endif
  34
  35 /*----- Header files ------------------------------------------------------*/
  36
  37 #include <mLib/macros.h>
  38
  39 /*----- Macros provided ---------------------------------------------------*/
  40
  41 /* --- Theory lesson --- *
  42  *
  43  * It's often useful to rearrange the bits in a word, or a value split across
  44  * two (or more) words, so it's worth taking a moment to consider how this
  45  * might be done efficiently.  Throughout this discussion, we use the
  46  * standard bit numbering, where the least significant bit in a word is bit
  47  * zero, with numbering increasing with significance.  Equivalently, bit
  48  * %$k$% has the numerical value %$2^k$%.
  49  *
  50  * An essential primitive is the `swizzle', which exchanges two similarly
  51  * arranged but disjoint groups of bits within a word which are separated by
  52  * a constant offset.  The groups of bits don't have to be contiguous, but
  53  * they must be identified by shifts of the same mask.
  54  *
  55  * An especially important class of swizzle permutations considers the
  56  * individual bits of bit indices.  Permutations of the bits in a word can be
  57  * interpreted as operations on the bits of the indices.  For %$i \ge 0$%,
  58  * let %$\mu_i$% be the mask such that bit %$k$% of %$\mu_i$% is set if and
  59  * only if bit %$i$% is clear in %$k$%.  Hence
  60  *
  61  *      %$\mu_0 = (\ldots 01010101010101010101010101010101)_2 = -1/3$% ,
  62  *      %$\mu_1 = (\ldots 00110011001100110011001100110011)_2 = -1/5$% ,
  63  *      %$\mu_2 = (\ldots 00001111000011110000111100001111)_2 = -1/17$% ,
  64  *      %$\mu_3 = (\ldots 00000000111111110000000011111111)_2 = -1/257$% ,
  65  *      etc.
  66  *
  67  * In general, the low %$2^i$% bits of %$\mu_i$% are set, the next least
  68  * significant %$2^i$% bits are clear, the next %$2^i$% bits are set, and so
  69  * on.  Hence, %$\mu_i \lsl 2^i = \bar{\mu}_i$%, or, in the %$2$%-adic
  70  * numbers %$\Z_2$%, %$2^{2^i} \mu_i = -1 - \mu_i$%, whence, in general,
  71  * %$\mu_i = -1/(2^{2^i} + 1)$%.  Let %$x$% be some binary value; now we can
  72  * describe some important swizzles.
  73  *
  74  *   * Let %$y=(x\bitand\mu_i)\lsl 2^i \bitor (x\bitand\bar{\mu}_i)\lsr 2^i$%,
  75  *     or %$y = (x\bitand\mu_i) \lsl 2^i \bitor (x \lsr 2^i)\bitand\mu_i$%.
  76  *     This exchanges the two sub-blocks of %$2^i$% bits in each %$2^{i+1}$%
  77  *     block in %$x$%.  In terms of indices, now the bits at indices in which
  78  *     bit %$i$% is set precede those in which bit %$i$% is clear.
  79  *     %%\emph{We have inverted index bit %$i$%.}%%
  80  *
  81  *   * Suppose that %$i < j$%, and let %$m = \bar{\mu}_i \bitand \mu_j$% and
  82  *     %$s = 2^j - 2^i$%; let %$y = (x \bitand m) \lsl s \bitor {}$%
  83  *     %$(x \lsr s)\bitand m \bitor (x\bitand \overline{m \bitor m \lsl s})$%.
  84  *     Now, %$m$% has its bit %$k$% set if and only if bit %$i$% of %$k$% is
  85  *     set and bit %$j$% of %$k$% is clear.  The related mask %$m \lsl s$%
  86  *     has bit %$k + s$% set if %$k$% has the same property; but, %$k$%
  87  *     will have bit %$i$% set and bit %$j$% clear if and only if bit %$i$%
  88  *     is clear and bit %$j$% is set in %$k + s$%.  Combined, the mask
  89  *     %$m \bitor (m \lsl s)$% selects bits at indices in which bits %$i$%
  90  *     and %$j$% differ, so %$\overline{m \bitor (m \lsl s)}$% selects the
  91  *     bits at indices where bits %$i$% and %$j$% are equal.
  92  *
  93  *     This swizzle therefore exchanges the bits of %$x$% at indices where
  94  *     bit %$i$% is set and bit %$j$% is clear with those at indices where
  95  *     bit %$j$% is set and %$i$% is clear, leaving alone those bits at
  96  *     indices where bits %$i$% and %$j$% are either both clear or both set.
  97  *     %%\emph{We have exchanged index bits %$i$% and %$j$%.}%%
  98  *
  99  *   * Rounding off this little collection, suppose again that %$i < j$%, and
 100  *     let %$m = \mu_i \bitand \mu_j$% and %$s = 2^i + 2^j$%; and again, let
 101  *     %$y = (x \bitand m) \lsl s \bitor (x \lsr s) \bitand m \bitor {}$%
 102  *     %$(x \bitand \overline{m \bitor m \lsl s})$%.  Now, %$m$% has its bit
 103  *     %$k$% set if and only if bits %$i$% and %$j$% of %$k$% are both clear.
 104  *     This swizzle therefore exchanges the bits of %$x$% at indices where
 105  *     bits %$i$% and %$j$% are both clear with those at indices where
 106  *     bits %$i$% and %$j$% are both set, leaving alone those bits at indices
 107  *     where bits %$i$% and %$j$% differ.  It takes a little work (left as
 108  *     an exercise) to see that the effect combines the previous two.
 109  *     %%\emph{We have exchanged and inverted index bits %$i$% and %$j$%.}%%
 110  *
 111  * Related is the `twizzle', which exchanges similarly arranged groups of
 112  * bits within two different words.  This can be seen as a multiprecision
 113  * variant of the swizzle.
 114  *
 115  * Finally, we consider general permutations.  These can be implemented using
 116  * Beneš networks.  Pick some index bit number %$i$%.  By applying a swizzle
 117  * with a shift by %$2^i$% to the inputs, and another to the outputs, we can
 118  * reduce the problem to finding two independent permutations, one affecting
 119  * bits whose index has bit %$i$% clear, and the other affecting bits whose
 120  * index has bit %$i$% set.  This doesn't sound so helpful, except that (a)
 121  * the smaller permutations can each be implemented in the same way, and (b)
 122  * they can be performed in parallel.  Small Beneš networks can be
 123  * constructed by hand, but computer assistance is useful for larger ones;
 124  * there are some utilities in `utils/benes.lisp'.
 125  *
 126  * The machinery here expects some parameters to have been defined:
 127  *
 128  *   * @regty@ should be an unsigned integer type, and
 129  *
 130  *   * @REGWD@ should be a power of two such that @regty@ can store at least
 131  *     @REGWD@ bits.
 132  */
 133
 134 /* We begin with some internal utilities.  @CATACOMB__REPLICATE_n_(x)@
 135  * produces a hexadecimal constant consisting of %$n$% copies of the digits
 136  * @x@.
 137  */
 138 #define CATACOMB__REPLICATE_16_(x) CATACOMB__REPLICATE_8_(GLUE(x, x))
 139 #define CATACOMB__REPLICATE_8_(x) CATACOMB__REPLICATE_4_(GLUE(x, x))
 140 #define CATACOMB__REPLICATE_4_(x) CATACOMB__REPLICATE_2_(GLUE(x, x))
 141 #define CATACOMB__REPLICATE_2_(x) CATACOMB__REPLICATE_1_(GLUE(x, x))
 142 #define CATACOMB__REPLICATE_1_(x) GLUE(0x, x)
 143
 144 /* More internal utilities.  @CATACOMB__REPLi_Un(x)@ returns an %$n$%-bit
 145  * hexadecimal constant formed by replicating the %$i$%-bit constant (which
 146  * must have leading zeros) %$n/i$% times.
 147  */
 148 #define CATACOMB__REPL8_U8 CATACOMB__REPLICATE_1_
 149 #define CATACOMB__REPL8_U16 CATACOMB__REPLICATE_2_
 150 #define CATACOMB__REPL8_U32 CATACOMB__REPLICATE_4_
 151 #define CATACOMB__REPL8_U64 CATACOMB__REPLICATE_8_
 152 #define CATACOMB__REPL8_U128 CATACOMB__REPLICATE_16_
 153
 154 #define CATACOMB__REPL16_U16 CATACOMB__REPLICATE_1_
 155 #define CATACOMB__REPL16_U32 CATACOMB__REPLICATE_2_
 156 #define CATACOMB__REPL16_U64 CATACOMB__REPLICATE_4_
 157 #define CATACOMB__REPL16_U128 CATACOMB__REPLICATE_8_
 158
 159 #define CATACOMB__REPL32_U32 CATACOMB__REPLICATE_1_
 160 #define CATACOMB__REPL32_U64 CATACOMB__REPLICATE_2_
 161 #define CATACOMB__REPL32_U128 CATACOMB__REPLICATE_4_
 162
 163 #define CATACOMB__REPL64_U64 CATACOMB__REPLICATE_1_
 164 #define CATACOMB__REPL64_U128 CATACOMB__REPLICATE_2_
 165
 166 #define CATACOMB__REPL128_U128 CATACOMB__REPLICATE_1_
 167
 168 /* Finally, @CATACOMB__REPLi(x)@ returns a hexadecimal constant formed by
 169  * replicating the %$i$%-bit constant (including leading zeros) sufficiently
 170  * many times as to fill a @REGWD@-bit wide register.
 171  */
 172 #define CATACOMB__REPL8(x) GLUE(CATACOMB__REPL8_U, REGWD)(x)
 173 #define CATACOMB__REPL16(x) GLUE(CATACOMB__REPL16_U, REGWD)(x)
 174 #define CATACOMB__REPL32(x) GLUE(CATACOMB__REPL32_U, REGWD)(x)
 175 #define CATACOMB__REPL64(x) GLUE(CATACOMB__REPL64_U, REGWD)(x)
 176 #define CATACOMB__REPL128(x) GLUE(CATACOMB__REPL128_U, REGWD)(x)
 177
 178 /* The macro @CATACOMB__IXMASK_Bi(_)@ evaluates to the low @REGWD@ bits of
 179  * the constant %$\mu_i$% defined above.  The argument is ignored; it's
 180  * necessary to prevent technical problems with macro expansion
 181  * (specifically, to allow the blue paint on @GLUE@ to be washed off before
 182  * invoking @CATACOMB__REPLi@).
 183  */
 184 #define CATACOMB__IXMASK_B0(_) CATACOMB__REPL8(55)
 185 #define CATACOMB__IXMASK_B1(_) CATACOMB__REPL8(33)
 186 #define CATACOMB__IXMASK_B2(_) CATACOMB__REPL8(0f)
 187 #define CATACOMB__IXMASK_B3(_) CATACOMB__REPL16(00ff)
 188 #define CATACOMB__IXMASK_B4(_) CATACOMB__REPL32(0000ffff)
 189 #define CATACOMB__IXMASK_B5(_) CATACOMB__REPL64(00000000ffffffff)
 190 #define CATACOMB__IXMASK_B6(_)                                          \
 191         CATACOMB__REPL128(0000000000000000ffffffffffffffff)
 192
 193 /* @IXMASK(i)@ returns the low @REGWD@ bits of %$\mu_i$%.  The argument @i@
 194  * must be a decimal integer constant, without leading zeros.
 195  */
 196 #define IXMASK(i) GLUE(CATACOMB__IXMASK_B, i)(hunoz)
 197
 198 /* @IXMASK_xy(i, j)@ returns a @REGWD@-bit mask in which bit %$k$% is set if
 199  * bit %$i$% of %$k$% is equal to %$x$% and bit %$j$% of %$k$% is equal to
 200  * %$y$%.  The arguments @i@ and @j@ must be decimal integer constants,
 201  * without leading zeros.
 202  */
 203 #define IXMASK_00(i, j) (IXMASK(i)&IXMASK(j))
 204 #define IXMASK_01(i, j) (IXMASK(i)&~IXMASK(j))
 205 #define IXMASK_10(i, j) (~IXMASK(i)&IXMASK(j))
 206 #define IXMASK_11(i, j) (~IXMASK(i)&~IXMASK(j))
 207
 208 /* The general swizzle operation.  Exchange the bits in @x@ selected by
 209  * @mask@ with those selected by @mask << shift@.
 210  */
 211 #define SWIZZLE(x, shift, mask) do {                                    \
 212   regty _t = ((x) ^ ((x) >> (shift)))&(mask);                           \
 213   (x) ^= _t | (_t << (shift));                                          \
 214 } while (0)
 215
 216 /* A swizzle on two words @x@ and @y@, using the same shift, but different
 217  * masks @mask0@ and @mask1@.  This is just a simple abbreviation.
 218  */
 219 #define SWIZZLE_2(x, y, shift, mask0, mask1) do {                       \
 220   SWIZZLE(x, shift, mask0); SWIZZLE(y, shift, mask1);                   \
 221 } while (0)
 222
 223 /* A `twizzle', or a swizzle across two words.
 224  *
 225  * The @TWIZZLE_0@ macro exchanges the bits of @x@ and @y selected by
 226  * @mask@.  The @TWIZZLE_L@ and @TWIZZLE_R@ macros exchange the bits selected
 227  * by @mask@ in @y@ with the bits in @x@ selected by @mask << shift@ or
 228  * @mask >> shift@ respectively.  (The names are from the direction in which
 229  * @x@ is shifted, not the direction the mask is shifted.)
 230  *
 231  * These are used to synthesize swizzles within multiprecision words: if the
 232  * intended shift is %$a w + b$%, where %$w$% is the word width, then %$a$%
 233  * gives the difference between word indices of the words to be processed,
 234  * and %$\abs{b$}% gives the @shift@ argument; use @TWIZZLE_R@ if
 235  * %$b \ge 0$%, @TWIZZLE_L@ if %$b \le 0$% is nonpositive, or @TWIZZLE_0@ if
 236  * %$b = 0$%.  (We can easily distinguish which of %$a w \pm b$% or
 237  * %$(a \pm 1) w \mp (w - b)$%, since one kind of shift will keep @mask@
 238  * within the same word, and the other will shift it out completely.)
 239  */
 240 #define TWIZZLE_0(x, y, mask) do {                                      \
 241   regty _t = ((y) ^ ((x)))&(mask);                                      \
 242   (x) ^= _t; (y) ^= _t;                                                 \
 243 } while (0)
 244 #define TWIZZLE_L(x, y, shift, mask) do {                               \
 245   regty _t = ((y) ^ ((x) << (shift)))&(mask);                           \
 246   (x) ^= _t >> (shift); (y) ^= _t;                                      \
 247 } while (0)
 248 #define TWIZZLE_R(x, y, shift, mask) do {                               \
 249   regty _t = ((y) ^ ((x) >> (shift)))&(mask);                           \
 250   (x) ^= _t << (shift); (y) ^= _t;                                      \
 251 } while (0)
 252
 253 /* @SWIZZLE_CPL@ applies a swizzle to @x@ which complements index bit @i@;
 254  * @SWIZZLE_EXCH@ applies a swizzle to exchange index bits @i@ and @j@; and
 255  * @SWIZZLE_XCPL@ applies a swizzle to exchange and invert index bits @i@ and
 256  * @j@.  The arguments @i@ and @j@ must be decimal integer constants without
 257  * leading zeros, with %$i \le j$%.  (The macros do nothing if %$i = j$%.)
 258  *
 259  * The variants with @2@ in their names act identically on @x@ and @y@, and
 260  * are intended as a simple convenience.
 261  */
 262 #define SWIZZLE_CPL(x, i) SWIZZLE(x, (1 << (i)), IXMASK(i))
 263 #define SWIZZLE_EXCH(x, i, j)                                           \
 264         SWIZZLE(x, (1 << (j)) - (1 << (i)), IXMASK_10(i, j))
 265 #define SWIZZLE_XCPL(x, i, j)                                           \
 266         SWIZZLE(x, (1 << (j)) + (1 << (i)), IXMASK_00(i, j))
 267
 268 #define SWIZZLE_CPL2(x, y, i)                                           \
 269         SWIZZLE_2(x, y, (1 << (i)), IXMASK(i), IXMASK(i))
 270 #define SWIZZLE_EXCH2(x, y, i, j)                                       \
 271         SWIZZLE_2(x, y, (1 << (j)) - (1 << (i)),                        \
 272                   IXMASK_10(i, j), IXMASK_10(i, j))
 273 #define SWIZZLE_XCPL2(x, y, i, j)                                       \
 274         SWIZZLE_2(x, y, (1 << (j)) + (1 << (i)),                        \
 275                   IXMASK_00(i, j), IXMASK_00(i, j))
 276
 277 /* The @TWIZZLE_EXCH@ and @TWIZZLE_XCPL@ macros act like the similarly named
 278  * @SWIZZLE_...@ macros above, except that (a) they act on two words from a
 279  * multiprecision value, and the @j@ index is implicit in the selection of
 280  * the operands @x@ and @y@.  If the word width is %$w$%, and %$2^j = n w$%,
 281  * then %$x$% should be chosen to be %$n$% slots more significant than
 282  * %$y$%.
 283  *
 284  * Note that there is no @TWIZZLE_CPL@, since this would simply involve
 285  * exchanging two entries in an array.
 286  */
 287 #define TWIZZLE_EXCH(x, y, i) TWIZZLE_L(x, y, (1 << (i)), ~IXMASK(i))
 288 #define TWIZZLE_XCPL(x, y, i) TWIZZLE_R(x, y, (1 << (i)), IXMASK(i))
 289
 290 /*----- That's all, folks -------------------------------------------------*/
 291
 292 #ifdef __cplusplus
 293   }
 294 #endif
 295
 296 #endif