chiark - git - mdw - mLib/blob - codec/codec.3

   1 .\" -*-nroff-*-
   2 .TH codec 3 "9 January 2009" "Straylight/Edgeware" "mLib utilities library"
   3 .SH NAME
   4 codec \- binary encoding and decoding
   5 .\" @codec_class
   6 .\" @codec_strerror
   7 .\" @null_codec_class
   8 .\" @base64_class
   9 .\" @file64_class
  10 .\" @base64url_class
  11 .\" @base32_class
  12 .\" @base32hex_class
  13 .\" @hex_class
  14 .SH SYNOPSIS
  15 .nf
  16 .B "#include <mLib/codec.h>"
  17 .B "#include <mLib/base64.h>"
  18 .B "#include <mLib/base32.h>"
  19 .B "#include <mLib/hex.h>"
  20 .PP
  21 .B "#define CDCF_LOWERC ..."
  22 .B "#define CDCF_IGNCASE ..."
  23 .B "#define CDCF_NOEQPAD ..."
  24 .B "#define CDCF_IGNEQPAD ..."
  25 .B "#define CDCF_IGNEQMID ..."
  26 .B "#define CDCF_IGNZPAD ..."
  27 .B "#define CDCF_IGNNEWL ..."
  28 .B "#define CDCF_IGNINVCH ..."
  29 .B "#define CDCF_IGNSPC ..."
  30 .B "#define CDCF_IGNJUNK ..."
  31 .PP
  32 .ta 2n
  33 .B "enum {"
  34 .B "    CDCERR_OK = ...,"
  35 .B "    CDCERR_INVCH = ...,"
  36 .B "    CDCERR_INVEQPAD = ...,"
  37 .B "    CDCERR_INVZPAD = ..."
  38 .B "};"
  39 .PP
  40 .B "typedef struct {"
  41 .B "    const char *name;"
  42 .ta 2n +\w'\fBcodec *(*encoder)('u
  43 .BI "   codec *(*encoder)(unsigned " flags ,
  44 .BI "           const char *" indent ", unsigned " maxline );
  45 .BI "   codec *(*decoder)(unsigned " flags );
  46 .B "    ...\&"
  47 .B "} codec_class;"
  48 .PP
  49 .B "typedef struct {"
  50 .B "    const codec_ops *ops;"
  51 .B "} codec;"
  52 .PP
  53 .B "typedef struct {"
  54 .B "    const codec_class *c;"
  55 .BI "   int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d );
  56 .BI "   void (*destroy)(codec *" c );
  57 .B "} codec_ops;"
  58 .PP
  59 .B "codec_class null_codec_class;"
  60 .B "codec_class base64_class, file64_class, base64url_class;"
  61 .B "codec_class base32_class, base32hex_class;"
  62 .B "codec_class hex_class;"
  63 .PP
  64 .BI "const char *codec_strerror(int " err ");"
  65 .fi
  66 .SH DESCRIPTION
  67 The
  68 .B codec
  69 system provides an object-based interface to functions which encode
  70 binary data as plain text and decode the result to recover the original
  71 binary data.  The interface makes it easy to support multiple encodings
  72 and select an appropriate one at runtime.
  73 .SS "The codec_class structure"
  74 The
  75 .B codec_class
  76 structure represents a particular encoding format.  The structure has
  77 the following members.
  78 .TP
  79 .B "const char *name"
  80 The name of the class, as a null-terminated string.  The name should not
  81 contain whitespace characters.
  82 .TP
  83 .BI "codec *(*encoder)(unsigned " flags ", const char *" indent ", unsigned " maxline ")"
  84 Pointer to a function which constructs a new encoder object, of type
  85 .BR codec .
  86 The
  87 .I flags
  88 configure the behaviour of the object; the
  89 .I indent
  90 string is written to separate lines of output; the integer
  91 .I maxline
  92 is the maximum length of line to be produced, or zero to forbid line
  93 breaking.
  94 .TP
  95 .BI "codec *(*decoder)(unsigned " flags ")"
  96 Pointer to a function which constructs a new decoder object, also of
  97 type
  98 .BR codec .
  99 The
 100 .I flags
 101 configure the behaviour of the object.
 102 .PP
 103 The
 104 .I flags
 105 to the
 106 .B encoder
 107 and
 108 .B decoder
 109 functions have the following meanings.
 110 .TP
 111 .B CDCF_LOWERC
 112 For codecs which produce output using a single alphabetic case (e.g.,
 113 .BR base32 ,
 114 .BR hex ),
  115 emit and accept only lower case; the default is to emit and accept only
 116 upper case, for compatibility with RFC4648.  If the codec usually
 117 produces mixed-case output, then this flag is ignored.
 118 .TP
 119 .B CDCF_IGNCASE
 120 For codecs which produce output using a single alphabetic case, ignore
 121 the case of the input when decoding.  If the codec usually produces
 122 mixed-case output, then this flag is ignored.
 123 .TP
 124 .B CDCF_NOEQPAD
 125 For codecs which usually pad their output (e.g.,
 126 .BR base64 ,
 127 .BR base32 ),
 128 do not emit or accept padding characters.  If the codec does not usually
 129 produce padding, or the padding is not redundant, then this flag is
 130 ignored.
 131 .TP
 132 .B CDCF_IGNEQPAD
 133 For codecs which usually pad their output, do not treat incorrect (e.g.,
 134 missing or excessive) padding as an error when decoding.  If the codec
 135 does not usually produce padding, or the padding is required for
 136 unambiguous decoding, then this flag is ignored.
 137 .TP
 138 .B CDCF_IGNEQMID
 139 For codecs which usually pad their output, ignore padding characters
 140 wherever they may appear when decoding.  Usually padding characters
 141 indicate the end of the input, and further input characters are
 142 considered erroneous.  If the codec does not usually produce padding, or
 143 it is impossible to resume decoding correctly having seen padding
 144 characters, then this flag is ignored.
 145 .TP
 146 .B CDCF_IGNZPAD
 147 For codecs which need to pad their input, ignore unusual padding bits
 148 when decoding.  (This is not at all the same thing as the padding
 149 characters controlled by the flags above: they deal with padding the
 150 length of the encoding
 151 .I output
 152 up to a suitable multiple of characters; this option deals with padding
 153 of the
 154 .I input
 155 prior to encoding.)  If the codec does not add padding bits, or specific
 156 values are required for unambiguous decoding, then this flag is ignored.
 157 .TP
 158 .B CDCF_IGNNEWL
 159 Ignore newline (and carriage-return) characters when decoding: the
 160 default for RFC4648 codecs is to reject newline characters.  If these
 161 characters are significant in the encoding, then this flag is ignored.
 162 .TP
 163 .B CDCF_IGNSPC
 164 Ignore whitespace characters (other than newlines) when decoding: the
 165 default for RFC4648 codecs is to reject whitespace characters.  If these
 166 characters are significant in the encoding, then this flag is ignored.
 167 .TP
 168 .B CDCF_IGNINVCH
 169 Ignore any other invalid characters appearing in the input when
 170 decoding.
 171 .TP
 172 .B CDCF_IGNJUNK
 173 Ignore all `junk' in the input.  This should suppress almost all
 174 decoding errors.
 175 .PP
 176 If you do not set any of the
 177 .BR CDCF_IGN ...\&
 178 flags, a decoder should only accept the exact encoding that the
 179 corresponding encoder would produce (with
 180 .I maxline
 181 = 0 to inhibit line-breaking).
 182 .SS "The codec and codec_ops structures"
 183 The
 184 .B codec
 185 structure represents the state of an encoder or decoder, as returned by
 186 the
 187 .B encoder
 188 and
 189 .B decoder
  190 functions described above.  It contains a single member.
 191 .TP
 192 .B "const codec_ops *ops"
 193 Pointer to a
 194 .B codec_ops
 195 structure which contains operations and metadata for use with the
 196 encoder or decoder.
 197 .PP
 198 The
 199 .B codec_ops
 200 structure contains the following members.
 201 .TP
 202 .B "const codec_class *c"
 203 Pointer back to the
 204 .B codec_class
 205 which was used to construct the
 206 .B codec
 207 object.
 208 .TP
 209 .BI "int (*code)(codec *" c ", const void *" p ", size_t " sz ", dstr *" d ")"
 210 Encode or decode, using the codec
 211 .IR c ,
 212 the data in the buffer at address
 213 .I p
 214 and continuing for
 215 .I sz
 216 bytes, appending the output to the dynamic string
 217 .I d
 218 (see
 219 .BR dstr (3)).
 220 If the operation was successful, the function returns zero; otherwise it
 221 returns a nonzero error code, as described below.
 222 .TP
 223 .BI "void (*destroy)(codec *" c ")"
 224 Destroy the codec object
 225 .IR c ,
 226 freeing any resources it may hold.
 227 .PP
  228 A codec may buffer its input (e.g., if it needs to see more in order to
 229 decide what output to produce next); it may also need to take special
 230 action at the end of the input (e.g., flushing buffers, and applying
 231 padding).  To signal the codec that there is no more input, call the
 232 .B code
 233 function with a null
 234 .I p
 235 pointer.  It will then write any final output to
 236 .IR d .
 237 .PP
 238 The following error conditions may be reported.
 239 .TP
 240 .B CDCERR_INVCH
 241 An invalid character was encountered while decoding.  This includes
  242 encountering padding characters if padding is disabled using the
 243 .B CDCF_NOEQPAD
 244 flag.
 245 .TP
 246 .B CDCERR_INVEQPAD
 247 Invalid padding characters (e.g., wrong characters, or too few, too
 248 many, or none at all) were found during decoding.  This may also
 249 indicate that the input is truncated, even if the codec does not usually
 250 perform output padding.
 251 .TP
 252 .B CDCERR_INVZPAD
 253 Invalid padding bits were found during decoding.
 254 .PP
 255 The
 256 .B codec_strerror
 257 function converts these error codes to brief, (moderately)
 258 human-readable strings.
 259 .SS "Provided codecs"
 260 The library provides a number of standard codecs.
 261 .TP
 262 .B base64
 263 Implements Base64 encoding, as defined by RFC4648.  Output is
 264 mixed-case, so the
 265 .B CDCF_LOWERC
 266 and
 267 .B CDCF_IGNCASE
 268 flags are ignored.
 269 .TP
  270 .B file64
 271 Implements a variant of the Base64 encoding which uses
 272 .RB ` % '
 273 in place of
 274 .RB ` / ',
 275 so that its output is suitable for use as a Unix filename.
 276 .TP
 277 .B base64url
 278 Implements the filename- and URL-safe variant of Base64 encoding, as
 279 defined by RFC4648.
 280 .TP
 281 .B base32
 282 Implements Base32 encoding, as defined by RFC4648.  Output is in upper
 283 case by default.
 284 .TP
 285 .B base32hex
 286 Implements the extended-hex variant of Base32, as defined by RFC4648.
  287 This encoding has the property that it preserves the ordering
 288 of messages if padding is suppressed.
 289 .TP
 290 .B hex
 291 Implements hex encoding, defined by RFC4648 under the name Base16.  For
 292 compatibility with that specification, output is in upper case by
 293 default.
 294 .SH "SEE ALSO"
 295 .BR bincode (1),
 296 .BR dstr (3),
 297 .BR mLib (3).
 298 .SH AUTHOR
 299 Mark Wooding, <mdw@distorted.org.uk>