%%% [storin] / storin.tex

%%% -*-latex-*-
%%%
%%% $Id: storin.tex,v 1.7 2001/03/11 23:46:56 mdw Exp $
%%%
%%% Definition of the cipher
%%%
%%% (c) 2000 Mark Wooding
%%%

%%%----- Revision history ---------------------------------------------------
%%%
%%% $Log: storin.tex,v $
%%% Revision 1.7  2001/03/11 23:46:56  mdw
%%% Fixing to BibTeX stuff.
%%%
%%% Revision 1.6  2001/03/11 23:22:53  mdw
%%% Use BibTeX for the paper bibliography.
%%%
%%% Revision 1.5  2000/07/02 15:22:34  mdw
%%% Overhaul of differential cryptanalysis, including a new attack.
%%%
%%% Revision 1.4  2000/05/28 00:39:32  mdw
%%% Fix some errors.
%%%
%%% Revision 1.3  2000/05/25 19:46:22  mdw
%%% Improve analysis section.
%%%
%%% Revision 1.2  2000/05/21 21:43:26  mdw
%%% Fix a couple of typos.
%%%
%%% Revision 1.1  2000/05/21 11:28:30  mdw
%%% Initial check-in.
%%%

%%%----- Preamble -----------------------------------------------------------

\documentclass[a4paper]{article}
\usepackage[palatino, helvetica, courier, maths=cmr]{mdwfonts}
\usepackage{mdwtab}
\usepackage{mathenv}
\usepackage{amsfonts}
\usepackage{mdwmath}
\usepackage{url}
\usepackage[all, dvips]{xy}

%% --- Notation macros ---
%%
%% \ror, \rol, \lsr, \lsl: binary operators typeset as tripled (>>>, <<<) or
%% doubled (>>, <<) angle brackets, closed up with negative thin spaces.  The
%% text uses \lsr and \lsl for right and left shifts; \ror and \rol are
%% presumably the corresponding rotations (not used in the visible text).
\def\ror{\mathbin{>\!\!>\!\!>}}
\def\rol{\mathbin{<\!\!<\!\!<}}
\def\lsr{\mathbin{>\!\!>}}
\def\lsl{\mathbin{<\!\!<}}
%% \xor: exclusive-or, typeset as a circled plus.
\def\xor{\oplus}
%% \seq{x}: the argument enclosed in angle brackets; used for sequence names.
\def\seq#1{{\langle #1 \rangle}}

%% \hex{digits}: a hexadecimal constant -- typewriter digits followed by a
%% subscript 16.  Math mode only, because of the subscript.
\def\hex#1{\texttt{#1}_{16}}
%% \msgid: an alias for \url.
\let\msgid=\url

%% Relax TeX's line-breaking tolerances for the whole document.
\sloppy

\title{Storin: A block cipher for digital signal processors}
\author{Mark Wooding (\texttt{mdw@nsict.org})}

%% --- The cipher diagrams ---

%% \figkeymix{i}{j}{k}{l}: draw a key-mixing layer.
%%
%% For each of the four saved positions "a", "b", "c", "d" in turn: draw an
%% arrow from the position to an \xor node 0.5 units below it, save that node
%% under the same name, and draw a second arrow into the node from a subkey
%% label dropped one unit to its right.  The labels are k_i, k_j, k_k and
%% k_l for the four words respectively.  Expects "a".."d" to be set already
%% (e.g., by \figstart).
\def\figkeymix#1#2#3#4{%
  \ar "a"; p-(0, 0.5)*{\xor} ="a" \ar "a"+(1, 0) *+[r]{k_{#1}}; "a"%
  \ar "b"; p-(0, 0.5)*{\xor} ="b" \ar "b"+(1, 0) *+[r]{k_{#2}}; "b"%
  \ar "c"; p-(0, 0.5)*{\xor} ="c" \ar "c"+(1, 0) *+[r]{k_{#3}}; "c"%
  \ar "d"; p-(0, 0.5)*{\xor} ="d" \ar "d"+(1, 0) *+[r]{k_{#4}}; "d"%
}

%% \figmatrix: draw the matrix-multiplication box.
%%
%% Drops a framed box with the text `Matrix multiply' at "a" + (3, -1) and
%% saves it as "m".  Arrows are then drawn from "a", "b", "c" and "d" to
%% points on the box's top edge ("m"+U) at horizontal offsets -3, -1, +1 and
%% +3.  The saved positions "a".."d" are not changed here; callers pick up
%% the outputs from the bottom edge ("m"+D).
\def\figmatrix{%
  \POS "a"+(3, -1) *++=(7, 0)[F]u\txt{Matrix multiply} ="m"%
  \ar "a"; "m"+U-(3, 0) \ar "b"; "m"+U-(1, 0)%
  \ar "c"; "m"+U+(1, 0) \ar "d"; "m"+U+(3, 0)%
}

%% \figlintrans: draw the linear transformation for an encryption round,
%% immediately below a box drawn by \figmatrix.
%%
%% The same three steps are repeated for each of the four words:
%%   1. an arrow from the bottom edge of "m" (offsets -3, -1, +1, +3) to an
%%      \xor node 2.25 units below the word's old saved position; the node
%%      becomes the word's new saved position;
%%   2. a framed `>> 12' box is dropped one unit to the right of the node and
%%      saved under the scratch name "x";
%%   3. a branch is drawn with a `r turn from 0.5 units above the node into
%%      the box, followed by an arrow from the box into the node.
%% The scratch name "x" is overwritten for each word.
\def\figlintrans{%
  \ar "m"+D-(3, 0); "a"-(0, 2.25)*{\xor} ="a"%
  \POS "a"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "a"+(0, 0.5); p+(1, 0) "x" \ar "x"; "a"%
  \ar "m"+D-(1, 0); "b"-(0, 2.25)*{\xor} ="b"%
  \POS "b"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "b"+(0, 0.5); p+(1, 0) "x" \ar "x"; "b"%
  \ar "m"+D+(1, 0); "c"-(0, 2.25)*{\xor} ="c"%
  \POS "c"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "c"+(0, 0.5); p+(1, 0) "x" \ar "x"; "c"%
  \ar "m"+D+(3, 0); "d"-(0, 2.25)*{\xor} ="d"%
  \POS "d"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "d"+(0, 0.5); p+(1, 0) "x" \ar "x"; "d"%
}

%% \figilintrans: draw the linear transformation for a decryption round.
%%
%% Same drawing as \figlintrans, except that each word's incoming arrow
%% starts at its current saved position (not at the matrix box) and the \xor
%% node sits 1 unit below that position.  In \figiround this is used
%% directly after \figkeymix and before \figmatrix.  The shift box is again
%% `>> 12': the text states that the transformation is its own inverse.
\def\figilintrans{%
  \ar "a"; "a"-(0, 1)*{\xor} ="a"%
  \POS "a"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "a"+(0, 0.5); p+(1, 0) "x" \ar "x"; "a"%
  \ar "b"; "b"-(0, 1)*{\xor} ="b"%
  \POS "b"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "b"+(0, 0.5); p+(1, 0) "x" \ar "x"; "b"%
  \ar "c"; "c"-(0, 1)*{\xor} ="c"%
  \POS "c"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "c"+(0, 0.5); p+(1, 0) "x" \ar "x"; "c"%
  \ar "d"; "d"-(0, 1)*{\xor} ="d"%
  \POS "d"+(1, 0) *+[F]{{} \lsr 12} ="x"%
  \ar `r "d"+(0, 0.5); p+(1, 0) "x" \ar "x"; "d"%
}

%% \figstart{suffix}: begin a cipher diagram.
%%
%% Sets the coordinate base so that one unit is 1cm, sets the turn radius
%% used by curved arrow segments to 4pt, and drops the four input labels
%% a, b, c, d (each followed by `suffix', e.g. empty or a prime) at x = 0, 2,
%% 4, 6 on the line y = 0.  A plain line runs 0.5 units down from each label;
%% its lower end is saved as "a", "b", "c", "d" for the later layers.
\def\figstart#1{%
  \POS 0;<1cm,0cm>:%
  \turnradius{4pt}%
  \ar @{-} (0, 0) *+{a#1}; p-(0, 0.5) ="a"
  \ar @{-} (2, 0) *+{b#1}; p-(0, 0.5) ="b"
  \ar @{-} (4, 0) *+{c#1}; p-(0, 0.5) ="c"
  \ar @{-} (6, 0) *+{d#1}; p-(0, 0.5) ="d"
}

%% \figround{i}{j}{k}{l}{label}: draw one encryption round.
%%
%% Draws a dotted separator starting 0.5 units left of "a" and running 8
%% units to the right, and places the round's `label' text at "a" + (8,
%% -1.75).  The layers follow in order: key mixing with subkeys k_i..k_l
%% (\figkeymix), the matrix box (\figmatrix) and the linear transformation
%% (\figlintrans).  Finally each word gets a plain 0.5-unit stub downwards,
%% whose end is saved as the new "a".."d", ready for the next layer.
\def\figround#1#2#3#4#5{%
  \ar @{.} "a"-(0.5, 0); p+(8, 0)%
  \POS "a"+(8, -1.75) *[r]\txt{#5}%
  \figkeymix{#1}{#2}{#3}{#4}%
  \figmatrix%
  \figlintrans%
  \ar @{-} "a"; p-(0, .5) ="a"
  \ar @{-} "b"; p-(0, .5) ="b"
  \ar @{-} "c"; p-(0, .5) ="c"
  \ar @{-} "d"; p-(0, .5) ="d"
}

%% \figiround{i}{j}{k}{l}{label}: draw one decryption round.
%%
%% Separator and label are drawn as in \figround, but the layers come in the
%% order key mixing (\figkeymix), linear transformation (\figilintrans),
%% matrix box (\figmatrix).  Because the matrix box is last, the outgoing
%% 0.5-unit stubs start from its bottom edge ("m"+D at offsets -3, -1, +1,
%% +3); their ends are saved as the new "a".."d".
\def\figiround#1#2#3#4#5{%
  \ar @{.} "a"-(0.5, 0); p+(8, 0)%
  \POS "a"+(8, -1.75) *[r]\txt{#5}%
  \figkeymix{#1}{#2}{#3}{#4}%
  \figilintrans%
  \figmatrix%
  \ar @{-} "m"+D-(3, 0); p-(0, .5) ="a"
  \ar @{-} "m"+D-(1, 0); p-(0, .5) ="b"
  \ar @{-} "m"+D+(1, 0); p-(0, .5) ="c"
  \ar @{-} "m"+D+(3, 0); p-(0, .5) ="d"
}

%% \figgap: stand-in for the rounds omitted from the diagram.
%%
%% Draws the dotted separator, the fixed label `Six more rounds' at "a" +
%% (8, -1), and a dashed line 2 units down from each of "a".."d", saving the
%% lower ends as the new positions.  The count in the label is hard-coded;
%% both figures in this file draw two rounds explicitly before the gap.
\def\figgap{%
  \ar @{.} "a"-(0.5, 0); p+(8, 0)
  \POS "a"+(8, -1)*[r]\txt{Six more rounds}
  \ar @{--} "a"; "a"-(0, 2) ="a"
  \ar @{--} "b"; "b"-(0, 2) ="b"
  \ar @{--} "c"; "c"-(0, 2) ="c"
  \ar @{--} "d"; "d"-(0, 2) ="d"
}

%% \figwhite{i}{j}{k}{l}{suffix}: draw the final postwhitening stage.
%%
%% Draws the dotted separator and the fixed label `Postwhitening' at "a" +
%% (8, -1), then a key-mixing layer with subkeys k_i..k_l, and finally an
%% arrow 1 unit down from each word to its output label a, b, c, d (each
%% followed by `suffix', e.g. a prime for ciphertext words).
\def\figwhite#1#2#3#4#5{%
  \ar @{.} "a"-(0.5, 0); p+(8, 0)
  \POS "a"+(8, -1)*[r]\txt{Postwhitening}
  \figkeymix{#1}{#2}{#3}{#4}
  \ar "a"; p-(0, 1) *+{a#5}
  \ar "b"; p-(0, 1) *+{b#5}
  \ar "c"; p-(0, 1) *+{c#5}
  \ar "d"; p-(0, 1) *+{d#5}
}

\begin{document}
\maketitle

%%%----- The main text ------------------------------------------------------

\begin{abstract}
  We present Storin: a new 96-bit block cipher designed to play to the
  strengths of current digital signal processors (DSPs).  In particular, DSPs
  tend to provide single-cycle multiply-and-accumulate operations, making
  matrix multiplications very cheap.  Working in an environment where
  multiplication is as fast as exclusive-or changes the usual perceptions
  about which operations provide good cryptographic strength cheaply.  The
  scarcity of available memory, for code and for tables, and a penalty for
  nonsequential access to data also make traditional block ciphers based
  around substitution tables unsuitable.
\end{abstract}

\tableofcontents

\section{Definition of the cipher}

\subsection{Overview}

Storin is an eight-round SP network operating on 96-bit blocks.  The block
cipher uses 36 24-bit subkey words, derived from a user key by the key
schedule.

The 96-bit input is split into four 24-bit words.  Each round then processes
these four words, using the following three steps:
\begin{enumerate}
\item Mixing in of some key material.  Four 24-bit subkey words are XORed
  with the four data words.
\item A matrix multiplication mod $2^{24}$.  The four words are treated as a
  column vector and premultiplied by a $4 \times 4$ matrix using addition and
  multiplication mod $2^{24}$.  This is the main nonlinear step in the
  cipher, and it also provides most of the cipher's diffusion.
\item A simple linear transformation, which replaces each word $x$ by $x \xor
  (x \lsr 12)$.
\end{enumerate}
The four data words output by the final round are XORed with the last four
subkey words in a final postwhitening stage and combined to form the 96-bit
ciphertext.

The cipher structure is shown diagrammatically in figure~\ref{fig:cipher}.

\begin{figure}
\centering
\leavevmode
\begin{xy}
  \xycompile{
    \figstart{}
    \figround{0}{1}{2}{3}{Round 1}
    \figround{4}{5}{6}{7}{Round 2}
    \figgap
    \figwhite{32}{33}{34}{35}{'}}
\end{xy}
\caption{The Storin encryption function}
\label{fig:cipher}
\end{figure}

Since the matrix used in step 2 is chosen to be invertible, the cipher can be
inverted readily, simply by performing the inverse steps in the reverse
order.  Since the postwhitening stage is the same as a key mixing stage,
decryption can be viewed as eight rounds consisting of key mixing, linear
transformation and matrix multiplication, followed by a postwhitening stage.
Thus, the structure of the inverse cipher is very similar to the forwards
cipher, and uses the same components.  The decryption function is shown
diagrammatically in figure~\ref{fig:decipher}.

\begin{figure}
\centering
\leavevmode
\begin{xy}
  \xycompile{
    \figstart{'}
    \figiround{32}{33}{34}{35}{Round 1}
    \figiround{28}{29}{30}{31}{Round 2}
    \figgap
    \figwhite{0}{1}{2}{3}{}}
\end{xy}
\caption{The Storin decryption function}
\label{fig:decipher}
\end{figure}

The key schedule is designed to be simple and to reuse the cipher components
already available.  Given a user key, which is a sequence of one or more
24-bit words, it produces the 36 subkey words required by the cipher.  The
key schedule is very similar to Blowfish \cite{Schneier:1994:DNV}.  The
subkey array is assigned an initial constant value derived from the matrix
used in the cipher.  Words from the user key are XORed into the array,
starting from the beginning, and restarting from the beginning of the user
key when all the user key words are exhausted.  A 96-bit block is initialized
to zero, and enciphered with Storin, using the subkeys currently in the
array.  The first four subkey words are then replaced with the resulting
ciphertext, which is then encrypted again using the new subkeys.  The next
four subkey words are replaced with the ciphertext, and the process
continues, nine times in all, until all of the subkey words have been
replaced.

The Storin key schedule can in theory accept user keys up to 36 words (864
bits) long.  However, there are known problems with keys longer than 28 words
(672 bits), and these large keys are forbidden.  We expect that with long
keys, attacks will be found which are more efficient than an exhaustive
search of the keyspace; we therefore (conservatively) recommend 5 word
(120-bit) keys as a practical maximum.


\subsection{Encryption}

We define $\mathcal{W} = \mathbb{Z}_{2^{24}}$ to be the set of 24-bit words,
and $\mathcal{P} = \mathcal{W}^4$ to be the set of four-entry column vectors
over $\mathcal{W}$.  Storin plaintext blocks are members of $\mathcal{P}$.

The Storin encryption function uses 36 24-bit words of key material $k_0$,
$k_1$, \ldots, $k_{35}$, which are produced from the user key by the key
schedule, described below.  The key-mixing operation $K_i \colon \mathcal{P}
\to \mathcal{P}$ is defined for $0 \le i < 9$ by:
\[
  K_i \begin{pmatrix} a \\ b \\ c \\d \end{pmatrix}
  =
  \begin{pmatrix}
    a \xor k_{4i} \\ b \xor k_{4i+1} \\ c \xor k_{4i+2} \\ d \xor k_{4i+3}
  \end{pmatrix}
\]

The matrix multiplication operation $M \colon \mathcal{P} \to \mathcal{P}$
is described by $M(\mathbf{x}) = \mathbf{M} \mathbf{x}$, where $\mathbf{M}$
is a fixed invertible $4 \times 4$ matrix over $\mathcal{W}$.  The value of
$\mathbf{M}$ is defined below.

The linear transformation $L \colon \mathcal{P} \to \mathcal{P}$ is defined by:
\[
  L \begin{pmatrix} a \\ b \\ c \\ d \end{pmatrix}
  =
  \begin{pmatrix}
    a \xor (a \lsr 12) \\
    b \xor (b \lsr 12) \\
    c \xor (c \lsr 12) \\
    d \xor (d \lsr 12)
  \end{pmatrix}
\]

The round function $R_i \colon \mathcal{P} \to \mathcal{P}$ is defined for $0
\le i < 8$ by
\[ R_i(\mathbf{x}) = L\bigl(\mathbf{M} K_i(\mathbf{x}) \bigr) \]

The cipher $C \colon \mathcal{P} \to \mathcal{P}$ is defined in terms of
$R_i$ and $K_i$.  Let $\mathbf{x}_0 \in \mathcal{P}$ be a plaintext vector.
Let $\mathbf{x}_{i+1} = R_i(\mathbf{x}_i)$ for $0 \le i < 8$.  Then we define
$C(\mathbf{x}_0) = K_8(\mathbf{x}_8)$.


\subsection{Key schedule}

The key schedule converts a user key, which is a sequence of 24-bit words,
into the 36 subkeys required by the cipher.

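For $i \ge 0$, we define the constants $m_{16i}$, $m_{16i+1}$, \ldots,
$m_{16i+15}$ to be the entries of $\mathbf{M}^{i+2}$, taken row by row: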
\[
\begin{pmatrix}
  m_{16i +  0} & m_{16i +  1} & m_{16i +  2} & m_{16i +  3} \\
  m_{16i +  4} & m_{16i +  5} & m_{16i +  6} & m_{16i +  7} \\
  m_{16i +  8} & m_{16i +  9} & m_{16i + 10} & m_{16i + 11} \\
  m_{16i + 12} & m_{16i + 13} & m_{16i + 14} & m_{16i + 15}
\end{pmatrix}
= \mathbf{M}^{i + 2}
\]

Let the user-supplied key be $u_0$, $u_1$, \ldots, $u_{n-1}$, for some $n >
0$.  We define the sequence $z_0$, $z_1$, \ldots\ by
\[ z_i = m_i \xor u_{i \bmod n} \]
for $i \ge 0$.

Denote the result of encrypting vector $\mathbf{x}$ using subkeys from the
sequence $\seq{w} = w_0, w_1, \ldots, w_{35}$ as $C_{\seq{w}}(\mathbf{x})$.
We define the key schedule to be $k_0$, $k_1$, \ldots, $k_{35}$, where:
\begin{eqlines*}
  \seq{p^{(i)}} = k_0, k_1, \ldots, k_{4i-1}, z_{4i}, z_{4i+1}, \ldots \\
  \mathbf{x}_0 = \begin{pmatrix} 0 \\ 0 \\ 0 \\ 0 \end{pmatrix}; \qquad
  \begin{pmatrix} k_{4i} \\ k_{4i+1} \\ k_{4i+2} \\ k_{4i+3} \end{pmatrix}
    = \mathbf{x}_{i+1} = C_{\seq{p^{(i)}}}(\mathbf{x}_i)
\end{eqlines*}


\subsection{Decryption}

The individual operations used during encryption are all invertible.  Key
mixing is inverted by taking keys from the other end of the array:
\[ K^{-1}_i(\mathbf{x}) = K_{8-i}(\mathbf{x}) \]
The matrix multiplication may be inverted simply by using the inverse matrix
$\mathbf{M}^{-1}$:
\[ M^{-1}(\mathbf{x}) = \mathbf{M}^{-1} \mathbf{x} \]
Finally, the linear transformation is its own inverse:
\[ L^{-1}(\mathbf{x}) = L(\mathbf{x}) \]
The inverse round function can now be defined as:
\[ R^{-1}_i(\mathbf{x}) =
    \mathbf{M}^{-1} L\bigl(K^{-1}_i(\mathbf{x})\bigr) \]
    
The decryption function $C^{-1} \colon \mathcal{P} \to \mathcal{P}$ is
defined in terms of $R^{-1}$ and $K^{-1}$ in a very similar way to
encryption.  Let
$\mathbf{x}_0$ be a ciphertext vector.  Let $\mathbf{x}_{i+1} =
R^{-1}_i(\mathbf{x}_i)$ for $0 \le i < 8$.  Then we define
$C^{-1}(\mathbf{x}_0) = K^{-1}_8(\mathbf{x}_8)$.


\subsection{Constants}

The matrix $\mathbf{M}$ and its inverse $\mathbf{M}^{-1}$ are:
\begin{eqnarray*}[rl]
  \mathbf{M} = &
  \begin{pmatrix}
    \hex{f7a413} & \hex{54bd81} & \hex{447550} & \hex{ff4449} \\
    \hex{f31e87} & \hex{d85388} & \hex{de32cb} & \hex{40e3d7} \\
    \hex{d9db1d} & \hex{551b45} & \hex{e9d19f} & \hex{e443de} \\
    \hex{4b949a} & \hex{4d435d} & \hex{ef0a17} & \hex{b784e1}
  \end{pmatrix} \\
  \mathbf{M}^{-1} = &
  \begin{pmatrix}
    \hex{17391b} & \hex{fafb4b} & \hex{a66823} & \hex{f2efb6} \\
    \hex{13e0e5} & \hex{2ed5e4} & \hex{b2cfff} & \hex{d9cdb5} \\
    \hex{2af462} & \hex{33826d} & \hex{de66a1} & \hex{eb6c85} \\
    \hex{c2f423} & \hex{e904a3} & \hex{e772d8} & \hex{d791f1}
  \end{pmatrix}
\end{eqnarray*}


\section{Rationale and analysis}

\subsection{Design decisions}

The initial objective was to produce a cipher which played to the particular
strengths of digital signal processors.  DSPs tend to have good multipliers,
and are particularly good at matrix multiplication.  The decision to use a
matrix multiplication over $\mathbb{Z}_{2^{24}}$ seemed natural, given that
24 bits is a commonly offered word size.

The choice of a 96-bit block is also fairly natural.  A 2 word (48-bit) block
is clearly too small, and a 3 word (72-bit) block is a little on the small
side too.


\subsection{Matrix multiplication over $\mathbb{Z}_{2^{24}}$}

Integer multiplication on a DSP is a cheap source of nonlinearity.  Note that
bit $i$ of the result depends on all of the bits in the operands from
position $i$ downwards, i.e., those of lesser or equal significance.

The decision to make the $4 \times 4$ matrix fixed was taken fairly early on.
Generating invertible matrices from key material seemed like too much work to
expect from the DSP.

The matrix is generated pseudorandomly from a seed string, using SHA-1.  The
criteria we used to choose the matrix are:
\begin{enumerate}
\item The matrix must be invertible.
\item Exactly one entry in each row and column of the matrix must be even.
\end{enumerate}
Criterion 1 is obvious.  Criterion 2 encourages diffusion between the entries
in the block vector.  Note that if a matrix satisfies the second criterion,
its inverse also does.

Consider a vector $\mathbf{x}$ and its product with the matrix $\mathbf{M}
\mathbf{x}$.  Whether the top bit of entry $i$ in $\mathbf{x}$ affects
entry $j$ in the product depends on whether the entry in row $j$, column $i$
of $\mathbf{M}$ is even.  Criterion 2 ensures the following:
\begin{itemize}
\item A top-bit change in a single word affects three words in the output.
\item A top-bit change in two words affects two words in the output.
\end{itemize}

The seed string used is \texttt{matrix-seed-string}.  The program which
generates the matrix is included with the Storin example source code.

\subsection{The linear transformation}

A bit change in one of the inputs to the matrix can only affect bits at that
position and higher in the output.  The linear transformation at the end of
the round aims to provide diffusion from the high-order bits back to the
low-order bits.

A single high-order bit change in the input to a round will affect the
high-order bits of three words in the output of the matrix multiply.  The
linear transformation causes it to affect bits in the low halves of each of
these words.  The second round's multiplication causes these bits to affect
the whole top halves of all of the output words.  The linear transformation
propagates this change to the bottom halves.  Complete avalanche is therefore
achieved after three rounds of Storin.


\subsection{Key schedule notes}

The key schedule is intended to be adequate for bulk encryption; it doesn't
provide good key agility, and isn't intended to.  The key schedule accepts up
to 28 words of user key, although expecting 672 bits of security from the
cipher is not realistic.  The suggested maximum of 5 words (120 bits) seems
more sensible.  This maximum can be raised easily when our understanding of
the cipher increases our confidence in it.

The key schedule is strongly reminiscent of Blowfish
\cite{Schneier:1994:DNV}.  Use of existing components of the cipher, such as
the matrix multiplication and the cipher itself, helps reduce the amount of
code required in the implementation.

The restriction of the key schedule to 28 words is due to an interesting
property, also shared by Blowfish (see figure~\ref{fig:bfkeysched}): the
output of the first round of the second encryption doesn't depend on the
previous round.  To see why this is so, it is enough to note that the first
round key has just been set equal to what is now the plaintext; the result of
the key mixing stage is zero, which is unaffected by the matrix and linear
transformation.

A limit of 28 words is chosen to ensure that the round-1 key affects the
round-2 key in a part of the cipher earlier than the postwhitening stage.

\begin{figure}
\centering
\leavevmode
\begin{xy}
  \xycompile{
    \POS 0; <0.7cm, 0cm>:
    \POS (0, 0) ="o" +(3, 0) ="w"
    \ar "o" *+{P[0]}; p-(0, 1) *{\xor} ="x"
    \ar "x" -(1, 0) *+[l]{P[0]}; "x"
    \ar@{-} "x"; p-(0, 2) ="as"
    \ar "w" *+{P[1]}; p-(0, 2) *{\xor} ="x"
    \ar "o"-(0, 2); "x" |-*+[F]{F}
    \ar@{-} "x"; p-(0, 1) ="bs"
    \ar@{-} "as"; "bs"-(0, 1) ="w"
    \ar@{-} "bs"; "as"-(0, 1) ="o"
    \ar "o"; p-(0, 1) *+{P[1] \xor F(0)} ="x"
    \ar "x"; p-(0, 1) *{\xor} ="x"
    \ar "x" -(1, 0) *+[l]{P[1]}; "x"
    \ar "x"; p-(0, 2) *+{F(0)}
    \ar "w"; p-(0, 1) *+{0} ="x"
    \ar "x"; p-(0, 2) *{\xor} ="x"
    \ar "o"-(0, 3); "x" |-*+[F]{F}
    \ar "x"; p-(0, 1) *+{F^2(0)}}
\end{xy}
\caption{Blowfish key schedule: $P[2]$ and $P[3]$ don't depend on $P[0]$ and
  $P[1]$.}
\label{fig:bfkeysched}
\end{figure}

\subsection{Attacking Storin}

\subsubsection{Differential cryptanalysis}

There is a two-round truncated differential \cite{Wooding:2000:Storin-diff},
which can be used to break Storin reduced to only 2 rounds.  The differential
\[ \begin{pmatrix}
     1 \lsl 23 \\ 1 \lsl 23 \\ 1 \lsl 23 \\ 0
   \end{pmatrix} \to
   \begin{pmatrix}
     0 \\ 0 \\ 1 \lsl 23 \\ 0
   \end{pmatrix}
\]
holds with probability 1 through the matrix multiplication.
Differentials in the linear transform are easy to find; for example:
\[ \begin{pmatrix}
     0 \\ 0 \\ 1 \lsl 23 \\ 0
   \end{pmatrix} \to
   \begin{pmatrix}
     0 \\ 0 \\ (1 \lsl 23) \xor (1 \lsl 11) \\ 0
   \end{pmatrix}
\]
We can continue through the second round's matrix multiplication with a
truncated differential, again with probability 1:
\[ \begin{pmatrix}
     0 \\ 0 \\ (1 \lsl 23) \xor (1 \lsl 11) \\ 0
   \end{pmatrix} \to
   \begin{pmatrix}
     \delta_0 \lsl 12 \\
     (\delta_1 \lsl 12) \xor (1 \lsl 11) \\
     (\delta_2 \lsl 12) \xor (1 \lsl 11) \\
     (\delta_3 \lsl 12) \xor (1 \lsl 11) \\
   \end{pmatrix}
\]
where the $\delta_i$ are unknown 12-bit values.  Applying the linear
transformation to this output difference gives us
\[ \begin{pmatrix}
     \delta_0 \lsl 12 \\
     (\delta_1 \lsl 12) \xor (1 \lsl 11) \\
     (\delta_2 \lsl 12) \xor (1 \lsl 11) \\
     (\delta_3 \lsl 12) \xor (1 \lsl 11) \\
   \end{pmatrix} \to
   \begin{pmatrix}
     (\delta_0 \lsl 12) \xor \delta_0 \\
     (\delta_1 \lsl 12) \xor \delta_1 \xor (1 \lsl 11) \\
     (\delta_2 \lsl 12) \xor \delta_2 \xor (1 \lsl 11) \\
     (\delta_3 \lsl 12) \xor \delta_3 \xor (1 \lsl 11) \\
   \end{pmatrix}
\]
A subsequent key-mixing or postwhitening stage won't affect the difference.
We can therefore combine the differentials above to construct a probability-1
truncated differential for a 2-round variant of Storin:
\[ \begin{pmatrix}
     1 \lsl 23 \\ 1 \lsl 23 \\ 1 \lsl 23 \\ 0
   \end{pmatrix} \to
   \begin{pmatrix}
     (\delta_0 \lsl 12) \xor \delta_0 \\
     (\delta_1 \lsl 12) \xor \delta_1 \xor (1 \lsl 11) \\
     (\delta_2 \lsl 12) \xor \delta_2 \xor (1 \lsl 11) \\
     (\delta_3 \lsl 12) \xor \delta_3 \xor (1 \lsl 11) \\
   \end{pmatrix}
\]
This characteristic is non-iterative, and can't be extended to more rounds.

The differential can be converted into a key-recovery attack against $n$
rounds fairly easily, by obtaining the ciphertext for an appropriate
plaintext pair and guessing the $n - 2$ round keys, testing the guesses by
working backwards and finding out whether the expected output difference is
visible.  The attack requires a pair of chosen plaintexts, and
$O(2^{96(n - 2)})$ work.  It is only more efficient than exhaustive search
when the key is longer than $96(n - 2)$ bits.

This attack can be improved.  Consider a 3-round variant of Storin, where the
objective is to discover the postwhitening keys.  The postwhitening stage can
be commuted with the linear transform simply by applying the transform to the
postwhitening keys.  We do this, and guess the least significant 12 bits of
each of the (transformed) postwhitening key words.  Working through the
matrix multiplication mod $2^{12}$ rather than mod $2^{24}$ then gives us the
12 least significant bits of the state words on input to the matrix.  Further
key bits can then be guessed and tested, four at a time, to recover the
remaining postwhitening key bits, by ensuring that the differences in the
more significant bits of the third round matrix input obey the characteristic
described above.  This requires only about $2^{48}$ work, and may be extended
to further rounds by exhaustively searching for the extra round keys.

This attack can break Storin with $n$ rounds ($n \ge 3$) with minimal chosen
plaintext and $O(2^{48 + 96(n - 3)})$ work.  This is the best attack known
against Storin.

\subsubsection{Other attacks}

In \cite{Fisher:2000:Storin-collide}, Matthew Fisher speculates on breaking 2
rounds of Storin by forcing collisions in the matrix multiplication outputs.
This attack doesn't extend to more than two rounds either.

One possible avenue of attack worth exploring is to attempt to cause zero
words to be input into the first-round matrix by choosing plaintext words
identical to subkey words for the first round.  Causing $n$ matrix input
words to be zero clearly takes $O(2^{24n})$ time.  If a method can be found
to detect when zero words have been input to the matrix, this can be used to
discover the subkey words rather more rapidly than exhaustive search.  We
can't see a way to exploit this at the moment, but it could be a fruitful
place to look for cryptanalysis.


\section{Conclusion}

We have presented a new block cipher, Storin.  Any cryptanalysis will be
received with interest.

\bibliographystyle{alpha}
\bibliography{cryptography,mdw}

%%%----- That's all, folks --------------------------------------------------

\end{document}

%%% Local Variables: 
%%% mode: latex
%%% TeX-master: t
%%% End: