[catacomb] / symm / multigen

#! @PYTHON@
###
### Generate files by filling in simple templates
###
### (c) 2013 Straylight/Edgeware
###

###----- Licensing notice ---------------------------------------------------
###
### This file is part of Catacomb.
###
### Catacomb is free software; you can redistribute it and/or modify
### it under the terms of the GNU Library General Public License as
### published by the Free Software Foundation; either version 2 of the
### License, or (at your option) any later version.
###
### Catacomb is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU Library General Public License for more details.
###
### You should have received a copy of the GNU Library General Public
### License along with Catacomb; if not, write to the Free
### Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
### MA 02111-1307, USA.

from __future__ import with_statement

import itertools as IT
import optparse as OP
import os as OS
import re as RX
from cStringIO import StringIO
from sys import argv, exit, stderr

###--------------------------------------------------------------------------
### Utilities.

## Program name (the final path component of `argv[0]'), used by `die' as a
## prefix for error messages.
QUIS = OS.path.basename(argv[0])

def die(msg):
  """
  Report a fatal error and stop.

  Writes MSG to standard error on a line of its own, prefixed by the program
  name, and then exits with status 1.  This function does not return.
  """
  report = '%s: %s\n' % (QUIS, msg)
  stderr.write(report)
  exit(1)

def indexed(seq):
  """
  Generate pairs (I, X), where I counts from zero and X are the items of SEQ.

  SEQ may be any iterable.  The result is an iterator: the pairs are
  produced lazily, in order.
  """
  ## This is exactly what the built-in `enumerate' does, so use it rather
  ## than zipping a counter by hand.
  return enumerate(seq)

###--------------------------------------------------------------------------
### Reading the input values.

## Map each column name to a pair (REL, I), where REL is the Relation whose
## header mentions the name, and I is the index of the column within REL's
## rows.  Entries are added when a Relation is created.
COLMAP = {}

class Cursor (object):
  """
  A Cursor object keeps track of an iteration through a Relation.

  At any time, the Cursor has a `current' row; the individual cells of this
  row may be retrieved using Python's standard indexing operator.  The `step'
  method advances to the next row (if there is one).  The `reset' method
  returns to row zero.

  Once `step' has run off the end of the Relation, there is no current row
  until `reset' is called.
  """

  def __init__(me, rel):
    """
    Initialize a new Cursor object, tracking its way through a Relation REL.

    The new Cursor has row zero as its current row.  The REL must not be
    empty.
    """
    me._rel = rel
    me.reset()

  def step(me):
    """
    Advance the Cursor to the next row.

    Returns False if there is no next row; otherwise True.  After a False
    return the Cursor has no current row, and must be `reset' before it is
    stepped or indexed again.
    """
    me._i += 1
    if me._i >= len(me._rel):
      me._i = me._row = None
      return False
    me._row = me._rel[me._i]
    return True

  def reset(me):
    """
    Reset the Cursor, so that row zero is current again.
    """
    me._i = 0
    me._row = me._rel[0]

  def __getitem__(me, i):
    """
    Return the item in column I of the Cursor's current row.

    The index must be acceptable to the underlying row object, but otherwise
    the Cursor imposes no restrictions.  Indices need not be numeric, for
    example.
    """
    return me._row[i]

  def __repr__(me):
    """
    Return a text description of the Cursor, for diagnostic use.
    """
    ## Format the index with `%r' rather than `%d': it is None once the
    ## Cursor has been exhausted, and `%d' would raise TypeError then.
    return '#<Cursor %r[%r] = %r>' % (me._rel, me._i, me._row)

class CursorSet (object):
  """
  A CursorSet iterates over the cartesian product of a number of Relations.

  More precisely: it maintains a stack, each level of which tracks a number
  of Relations.  More Relations can be pushed onto this stack with the `push'
  method, and removed with `pop'.  The `step' method advances through the
  cartesian product of the Relations in the top level of the stack -- the
  `active' Relations.  Columns from the current rows of all of the currently
  known Relations -- whether active or not -- can be extracted using `get'.
  """

  def __init__(me):
    """
    Initialize a new CursorSet object.

    A new CursorSet has an empty stack.
    """
    ## `_map' maps each known Relation to its Cursor; `_act' is the list of
    ## Cursors for the active Relations (None until the first `push'); and
    ## each `_stack' entry records the previously active Cursors along with
    ## the Relations which were added by the corresponding `push'.
    me._map = {}
    me._stack = []
    me._act = None

  def push(me, rels):
    """
    Push the new Relations RELS onto the stack and start iterating.

    The currently active Relations are pushed down.  Those Relations which are
    not already known to the CursorSet become the newly active collection.
    (Relations which are already known are simply ignored.)

    Iteration varies the first of the newly active Relations, in the order
    in which RELS yields them, most rapidly.
    """
    cc = []
    rr = []
    for r in rels:
      if r in me._map: continue
      c = me._map[r] = Cursor(r)
      rr.append(r)
      cc.append(c)
    me._stack.append((me._act, rr))
    me._act = cc

  def step(me):
    """
    Advance the CursorSet through the currently active Relations.

    Return False if the active Relations have now been exhausted; otherwise
    return True.  On exhaustion, every active Cursor has been reset to its
    first row.
    """
    ## This works like incrementing a counter: advance the first Cursor; if
    ## it runs off the end then wrap it round to the start and carry into the
    ## next one.
    for c in me._act:
      if c.step(): return True
      c.reset()
    return False

  def pop(me):
    """
    Pop the active Relations.

    Return to iterating over the previously active collection.  The
    Relations which were added by the matching `push' are forgotten.
    """
    me._act, rels = me._stack.pop()
    for r in rels: del me._map[r]

  def get(me, rel, i):
    """
    Return the item with index I in the current row of Relation REL.
    """
    return me._map[rel][i]

class Relation (object):
  """
  A Relation is a table of data with named columns.

  It has a `header' -- a sequence of column-name strings -- and a list of
  rows, each of which holds exactly one item for each name in the header.

  Cursors and CursorSets are used to iterate over a Relation's rows.
  """

  def __init__(me, head):
    """
    Set up an empty Relation whose header is HEAD.

    Every name in HEAD is registered in the `COLMAP' dictionary, mapping it
    to this Relation together with the column's index.
    """
    me._head = head
    me._rows = []
    for pos, name in enumerate(head):
      COLMAP[name] = (me, pos)

  def addrow(me, row):
    """
    Append ROW to the Relation.

    It is a fatal error if ROW doesn't have exactly as many entries as the
    header.
    """
    want, got = len(me._head), len(row)
    if got != want:
      die("mismatch: row `%s' doesn't match heading `%s'" %
          (', '.join(row), ', '.join(me._head)))
    me._rows.append(row)

  def __len__(me):
    """Return how many rows the Relation currently holds."""
    rows = me._rows
    return len(rows)

  def __getitem__(me, i):
    """Return row number I of the Relation."""
    rows = me._rows
    return rows[i]

  def __repr__(me):
    """Return a description of the Relation, showing its header."""
    return '#<Relation %r>' % me._head

def read_immediate(word):
  """
  Return a Relation constructed by parsing WORD.

  The WORD has the form `HEAD=ROW ROW ...', where the HEAD and ROWs are
  comma-separated lists of strings which will form the relation's header and
  rows respectively.  There is no way to include an item which contains a
  comma or whitespace.

  It is a fatal error if WORD contains no `=' sign, or if some ROW has the
  wrong number of items.
  """
  ## Complain properly about a missing `=', rather than letting the
  ## unpacking below fail with a traceback.
  if '=' not in word:
    die("bad relation `%s': expected `HEAD=ROW ...'" % word)
  head, rows = word.split('=', 1)
  rel = Relation([c.strip() for c in head.split(',')])
  for row in rows.split():
    rel.addrow([c.strip() for c in row.split(',')])
  return rel

def read_file(spec):
  """
  Return a Relation constructed from a file, according to SPEC.

  The SPEC has the form `FILE:HEAD', where FILE names a file, and HEAD is a
  comma-separated list of strings to form the relation's header.  Each line
  from the file which is neither empty nor begins with `#' is split into
  whitespace-separated words to form a row in the relation.  There is no way
  to include an item which contains whitespace.

  It is a fatal error if SPEC contains no `:', or if some line has the wrong
  number of words.
  """
  ## Complain properly about a missing `:', rather than letting the
  ## unpacking below fail with a traceback.
  if ':' not in spec:
    die("bad file specification `%s': expected `FILE:HEAD'" % spec)
  path, head = spec.split(':', 1)
  rel = Relation([c.strip() for c in head.split(',')])
  with open(path) as f:
    for line in f:
      line = line.strip()
      if line.startswith('#') or line == '': continue
      rel.addrow(line.split())
  return rel

def read_thing(spec):
  """
  Construct a relation from SPEC.

  If SPEC begins with `@' then read the relation from a file (see
  `read_file'), passing on the rest of SPEC; otherwise interpret it as
  immediate data (see `read_immediate').

  The result of whichever reader was chosen is returned to the caller.
  """
  if spec.startswith('@'): return read_file(spec[1:])
  else: return read_immediate(spec)

###--------------------------------------------------------------------------
### Template structure.

class BasicTemplate (object):
  """
  Common base class for all template objects.

  Subclasses are expected to implement two methods:

  relations()           Return the set of Relations which the template's
                        substitutions mention at top level.

  subst(OUT, CS)        Expand the template, writing the result to the
                        stream OUT.  CS is a CursorSet object holding the
                        current state of the iteration.
  """

class LiteralTemplate (BasicTemplate):
  """
  A template which expands to a fixed piece of text.
  """

  def __init__(me, text, **kw):
    """
    Set up a LiteralTemplate which will write TEXT verbatim.

    Any further keyword arguments are handed on to the superclass
    initializer.
    """
    me._text = text
    super(LiteralTemplate, me).__init__(**kw)

  def relations(me):
    """Return the empty set: literal text mentions no relations."""
    none = set()
    return none

  def subst(me, out, cs):
    """Write the fixed text to OUT; the CursorSet CS is not consulted."""
    text = me._text
    out.write(text)

  def __repr__(me):
    """Return a description of the template, for diagnostic use."""
    return '#<LiteralTemplate %r>' % me._text

class TagTemplate (BasicTemplate):
  """
  A template which expands a single substitution tag.

  The expansion is one item taken from the current row of a relation,
  optionally passed through a transformation function first.
  """

  def __init__(me, rel, i, op, **kw):
    """
    Set up a TagTemplate.

    The item is taken from column I of the relation REL.  If OP is not None
    then it is a function which is applied to the item to produce the text
    actually written.  Any further keyword arguments are handed on to the
    superclass initializer.
    """
    super(TagTemplate, me).__init__(**kw)
    me._rel, me._i, me._op = rel, i, op

  def relations(me):
    """Return a set containing just the one relation this tag refers to."""
    rels = set()
    rels.add(me._rel)
    return rels

  def subst(me, out, cs):
    """
    Fetch the item from the relation's current row, as tracked by the
    CursorSet CS; transform it if necessary; and write it to OUT.
    """
    item = cs.get(me._rel, me._i)
    if me._op is None: out.write(item)
    else: out.write(me._op(item))

  def __repr__(me):
    """Return a description of the template, naming its column."""
    colname = me._rel._head[me._i]
    return '#<TagTemplate %s>' % colname

class SequenceTemplate (BasicTemplate):
  """
  A SequenceTemplate concatenates a number of other templates.
  """

  def __new__(cls, seq, **kw):
    """
    Construct a template from a sequence SEQ of other templates.

    If SEQ is a singleton (which it often is) then return it directly;
    otherwise construct a SequenceTemplate.
    """
    if len(seq) == 1:
      return seq[0]
    else:
      ## Don't pass SEQ or the keywords up to `object.__new__': it wants only
      ## the class.  Passing extra arguments there is deprecated in later
      ## Python 2 releases and is an error in Python 3.  The arguments reach
      ## `__init__' by the usual instance-creation machinery anyway.
      return super(SequenceTemplate, cls).__new__(cls)

  def __init__(me, seq, **kw):
    """
    Initialize a new SequenceTemplate object from SEQ.

    The sequence is flattened out: if SEQ contains SequenceTemplates then we
    use their children directly, so that we don't have a useless tree.
    """
    super(SequenceTemplate, me).__init__(**kw)
    tt = []
    cls = type(me)
    for t in seq:
      if isinstance(t, cls): tt += t._seq
      else: tt.append(t)
    me._seq = tt

  def relations(me):
    """
    The relations of a SequenceTemplate are the union of the relations of its
    children.
    """
    rr = set()
    for t in me._seq: rr.update(t.relations())
    return rr

  def subst(me, out, cs):
    """
    The output of a SequenceTemplate is the concatenation of the expansions
    of its children, in order.
    """
    for t in me._seq: t.subst(out, cs)

  def __repr__(me):
    """Return a description of the template, listing its children."""
    return '#<SequenceTemplate %r>' % me._seq

class RepeatTemplate (BasicTemplate):
  """
  A RepeatTemplate iterates its body over a number of relations.
  """

  def __init__(me, sub, **kw):
    """
    Initialize a new RepeatTemplate, given a template SUB to act as its body.

    Any further keyword arguments are handed on to the superclass
    initializer, as the other template classes do.
    """
    super(RepeatTemplate, me).__init__(**kw)
    me._sub = sub

  def relations(me):
    """
    A RepeatTemplate hides the relations of its body.
    """
    return set()

  def subst(me, out, cs):
    """
    Substitute a RepeatTemplate, by iterating over the relations mentioned in
    its body template.

    The body is expanded once for each combination of rows from those of its
    relations which the CursorSet CS isn't already tracking.  If any of the
    body's relations is empty then nothing is written at all.
    """
    rr = me._sub.relations()
    for r in rr:
      if len(r) == 0: return
    cs.push(rr)
    ## Make sure that the CursorSet's stack is put back the way we found it
    ## even if expanding the body fails part-way through.
    try:
      while True:
        me._sub.subst(out, cs)
        if not cs.step(): break
    finally:
      cs.pop()

  def __repr__(me):
    """Return a description of the template, showing its body."""
    return '#<RepeatTemplate %r>' % me._sub

###--------------------------------------------------------------------------
### Some slightly cheesy parsing machinery.

class ParseState (object):
  """
  A ParseState object keeps track of a parser's position in a file.

  The `curr' slot contains the current line under consideration, complete
  with its line terminator if it has one, or None if the input has been
  exhausted.
  """

  def __init__(me, file, text):
    """
    Initialize a ParseState object.

    The FILE is a string naming the source file, for use in error messages;
    the TEXT is a string holding the whole of the text to be parsed, which is
    split into lines here.  The first line, if there is one, becomes the
    current line.
    """
    me._file = file
    me._i = 0
    me._it = iter(text.splitlines(True))
    me.step()

  def step(me):
    """
    Advance the ParseState to the next line.

    Sets `curr' to the next line, or to None if the input is exhausted.  The
    line counter used for error messages is bumped only if there was a line
    to fetch.
    """
    ## Fetch at most one item by leaving the loop during its first iteration.
    ## Unlike calling the iterator's `next' method directly, this is spelled
    ## the same way in every version of Python.
    for line in me._it:
      me.curr = line
      me._i += 1
      return
    me.curr = None

  def error(me, msg):
    """
    Report a fatal error during parsing, attributing it to the current line.
    """
    die('%s:%d: %s' % (me._file, me._i, msg))

class token (object):
  """
  A token object has no interesting properties other than its identity.

  The name given at construction time is used only when printing the token.
  """

  def __init__(me, name):
    """Initialize a new token, remembering NAME for use by `repr'."""
    me._name = name
  def __repr__(me):
    """Return a description `#<NAME>' of the token, for diagnostic purposes."""
    return '#<%s>' % me._name

## Some magical tokens useful during parsing.  They are recognized by
## identity: `parse_template' returns EOF when the input runs out, and the
## `%end' directive handler returns END.
EOF = token('eof')
END = token('end')

## Regular expressions matching substitution tags.  Both are compiled in
## verbose mode, so the whitespace inside the patterns is not significant.
## R_SIMPLETAG matches `@COLUMN', capturing the column name.  R_COMPLEXTAG
## matches `@{COLUMN}' with any number of `:OPERATOR' suffixes inside the
## braces; it captures the column name and the whole run of suffixes.
R_SIMPLETAG = RX.compile(r'@ (\w+)', RX.VERBOSE)
R_COMPLEXTAG = RX.compile(r'@ { (\w+) ((?: : \w+)*) }', RX.VERBOSE)

## A dictionary mapping operation names to functions which implement them.
## It is filled in by the `defop' decorator, and consulted by `parse_text'
## when it finds operators in a tag.
OPMAP = {}

def defop(func):
  """
  Decorator for substitution operator functions.

  Remember the operator in `OPMAP'; the operator's name is taken from FUNC's
  name, removing a prefix `op_' if there is one.  The FUNC itself is returned
  unchanged.

  An operator function is given the raw value as an argument and should
  return the transformed value.
  """
  ## Use `__name__' rather than `func_name': the latter is a Python 2 alias
  ## which later versions of the language no longer provide.
  name = func.__name__
  if name.startswith('op_'): name = name[3:]
  OPMAP[name] = func
  return func

@defop
def op_u(val):
  """@{COLUMN:u} -- the item converted to upper case."""
  return val.upper()

@defop
def op_l(val):
  """@{COLUMN:l} -- the item converted to lower case."""
  return val.lower()

## Matches a maximal run of characters other than ASCII letters, digits and
## underscores.
R_NOTIDENT = RX.compile(r'[^a-zA-Z0-9_]+')
@defop
def op_c(val):
  """
  @{COLUMN:c} -- the item, with each run of characters other than ASCII
  letters, digits and `_' replaced with a single `_'.
  """
  return R_NOTIDENT.sub('_', val)

def _pairify(val):
  """
  Break VAL into a pair of strings at its first `=' sign.

  Given `THIS=THAT', return the pair (THIS, THAT); only the first `=' counts,
  so THAT may itself contain `=' signs.  If VAL contains no `=' at all, then
  return (VAL, VAL).
  """
  left, sep, right = val.partition('=')
  if not sep: return val, val
  return left, right

@defop
def op_left(val):
  """
  @{COLUMN:left} -- the part of the item before its first `=' sign, or the
  whole item if it has none.
  """
  left, _ = _pairify(val)
  return left
@defop
def op_right(val):
  """@{COLUMN:right} -- the right-hand side of the item."""
  return _pairify(val)[1]

def parse_text(ps):
  """
  Parse a run of ordinary text lines from the ParseState PS.

  Lines are consumed until the input runs out or a directive line turns up;
  the directive is left as the current line for the caller to deal with.
  Returns a template representing the text consumed.

  Within the text, tags of the form `@COLUMN' or `@{COLUMN:OPERATOR:...}'
  are replaced by TagTemplates, and `@@' stands for a single literal `@'.
  Operators are applied from left to right.  It is a fatal error if a tag is
  malformed, or names an unknown column or operator.

  Lines beginning `%#' are comments, and are dropped.  Lines beginning `%%'
  are ordinary text, once the first `%' has been removed; they may contain
  tags.  Any other line beginning `%' is a directive.
  """

  ## The templates collected so far, and the literal text which has been
  ## seen since the last one but not yet been made into a template.
  pieces = []
  pending = StringIO()

  def flush():
    ## Turn the pending literal text, if there is any, into a
    ## LiteralTemplate, and empty the buffer.
    text = pending.getvalue()
    if text: pieces.append(LiteralTemplate(text))
    pending.seek(0)
    pending.truncate()

  def compose(outer, inner):
    ## Return a function which applies INNER and then OUTER to its argument.
    return lambda x: outer(inner(x))

  ## Work through the input a line at a time.
  while ps.curr is not None:
    line = ps.curr

    ## Sort out lines beginning with `%': skip comments, unescape `%%', and
    ## stop at anything else, because it's a directive.
    if line.startswith('%#'):
      ps.step()
      continue
    if line.startswith('%%'):
      line = line[1:]
    elif line.startswith('%'):
      break

    ## Scan the line for `@' signs.  Here `pos' is the offset of the first
    ## character which hasn't been dealt with yet.
    pos = 0
    while True:
      at = line.find('@', pos)
      if at < 0: break

      ## Everything before the `@' is literal text.
      pending.write(line[pos:at])

      ## A doubled `@' is an escape for a single literal one.
      if line.startswith('@@', at):
        pending.write('@')
        pos = at + 2
        continue

      ## Otherwise there must be a tag here.  Try the simple form first, and
      ## then the braced form; and look up the column it names.
      m = R_SIMPLETAG.match(line, at) or R_COMPLEXTAG.match(line, at)
      if m is None: ps.error('invalid tag')
      col = m.group(1)
      if col not in COLMAP: ps.error("unknown column `%s'" % col)
      rel, index = COLMAP[col]

      ## Only the braced form has a second group.  If it's not empty then it
      ## holds the operators, each preceded by a colon: compose them so that
      ## the leftmost is applied first.
      transform = None
      if m.lastindex >= 2 and m.group(2):
        for opname in m.group(2)[1:].split(':'):
          if opname not in OPMAP:
            ps.error("unknown operation `%s'" % opname)
          op = OPMAP[opname]
          if transform is None: transform = op
          else: transform = compose(op, transform)

      ## The literal text seen so far comes before the tag in the output.
      flush()
      pieces.append(TagTemplate(rel, index, transform))

      ## Carry on from just after the tag.
      pos = m.end()

    ## No more tags: the rest of the line is literal text.
    pending.write(line[pos:])
    ps.step()

  ## All done.  Collect any remaining literal text, and combine the pieces.
  flush()
  return SequenceTemplate(pieces)

## A list of (REGEX, FUNC) pairs, associating compiled regular expressions
## with directive-processing functions.  It is filled in by the `direct'
## decorator, and `parse_template' tries the entries in order.
DIRECT = []

def direct(rx):
  """
  Return a decorator which registers a template-file directive handler.

  RX is a regular expression, written in verbose syntax, describing the
  directive text.  The decorated function is recorded in `DIRECT' against
  the compiled expression, and is otherwise left alone.  Handlers are called
  as FUNC(PS, M), where PS is the ParseState, and M is the match object
  produced by matching RX against the directive text.
  """
  pattern = RX.compile(rx, RX.VERBOSE)
  def register(func):
    DIRECT.append((pattern, func))
    return func
  return register

def parse_template(ps):
  """
  Parse one template from the ParseState PS.

  This is either a run of text (see `parse_text'), or a single directive, in
  which case the work is done by the first handler in `DIRECT' whose regular
  expression matches.

  Returns whatever the text parser or directive handler returned: usually a
  template object, though handlers may return special tokens.  The token
  `EOF' is returned if there is no input left.  A directive which nothing
  recognizes is a fatal error.
  """

  ## Comments at the start are dropped here, so that `parse_text' isn't
  ## asked to build a template out of nothing but comments.
  while ps.curr is not None and ps.curr.startswith('%#'): ps.step()

  ## See what kind of line we've ended up at.
  line = ps.curr
  if line is None: return EOF

  ## Text is anything not beginning with `%', and also anything beginning
  ## with `%%'.
  if line.startswith('%%') or not line.startswith('%'):
    return parse_text(ps)

  ## So this must be a directive.  Strip off the `%' and any surrounding
  ## whitespace, and offer the rest to each of the handlers in turn.  The
  ## handler starts work with the following line current.
  directive = line[1:].strip()
  for rx, handler in DIRECT:
    m = rx.match(directive)
    if m is not None:
      ps.step()
      return handler(ps, m)
  ps.error("unrecognized directive")

def parse_templseq(ps, nestp):
  """
  Collect templates from the ParseState PS into a single template.

  Keeps calling `parse_template' until it returns a token which stops the
  sequence.  If NESTP is true, then the sequence must be finished by an
  `END' token (presumably from a directive handler), and running out of
  input is a fatal error; if NESTP is false, then the sequence runs to the
  end of the input, and an `END' token is a fatal error.

  Returns a template object combining everything collected.
  """

  parts = []
  while True:
    item = parse_template(ps)
    if item is END:
      if not nestp: ps.error("unexpected `end' directive")
      break
    if item is EOF:
      if nestp: ps.error("unexpected end of file")
      break
    parts.append(item)
  return SequenceTemplate(parts)

@direct(r'repeat')
def dir_repeat(ps, m):
  """
  %repeat
  BODY
  %end

  Parse the BODY, as far as the matching `%end', and return a RepeatTemplate
  which iterates it over the cartesian product of the relations mentioned
  within.
  """
  body = parse_templseq(ps, True)
  return RepeatTemplate(body)

@direct(r'end')
def dir_end(ps, m):
  """%end -- an end marker used to delimit chunks of template."""
  return END

def compile_template(file, text):
  """
  Compile the string TEXT into a template object, and return it.

  The name FILE is used only to attribute parse errors.  The text is parsed
  as a top-level sequence, so it runs to the end of the input.
  """
  return parse_templseq(ParseState(file, text), False)

###--------------------------------------------------------------------------
### Main code.

## Build the command-line parser.
op = OP.OptionParser(
  description = 'Generates files by filling in simple templates',
  usage = 'usage: %prog {-l | -g TMPL} FILE [COL,...=VAL,... ... | @FILE:COL,...] ...',
  version = 'Catacomb version @VERSION@')

def cb_gen(opt, optstr, arg, op):
  """
  Option callback for `-g': select `gen' mode, and remember the option's
  argument ARG as the name of the template file.
  """
  op.values.mode = 'gen'
  op.values.input = arg

## The two options each select a mode of operation.
op.add_option('-l', '--list',
              action = 'store_const', const = 'list', dest = 'mode',
              help = 'list filenames generated')
op.add_option('-g', '--generate',
              action = 'callback', metavar = 'TEMPLATE',
              callback = cb_gen, type = 'string',
              help = 'generate file(s) from TEMPLATE file')

## If neither is given, the mode keeps a value which the main dispatch
## doesn't recognize.
op.set_defaults(mode = 'what?')
opts, args = op.parse_args()

## The first positional argument is the pattern for the output file names;
## the others describe relations.  The relations must be read before the
## pattern is compiled, since compiling looks column names up in `COLMAP'.
if not args: op.error('missing FILE')
filepat = args[0]
for rel in args[1:]:
  read_thing(rel)
filetempl = compile_template('<output>', filepat)

def filenames(filetempl):
  """
  Generate the file names described by the compiled template FILETEMPL.

  Yields a pair (NAME, CS) for each combination of rows from the relations
  which the template mentions: NAME is the expansion of the template for
  that combination, and CS is the CursorSet, positioned at those rows.  The
  same CursorSet object is yielded every time, so it's only good until the
  generator is resumed.  If any of the relations is empty then nothing is
  generated at all.
  """
  rels = filetempl.relations()
  if any(len(r) == 0 for r in rels): return
  cursors = CursorSet()
  cursors.push(rels)
  more = True
  while more:
    buf = StringIO()
    filetempl.subst(buf, cursors)
    yield buf.getvalue(), cursors
    more = cursors.step()
  cursors.pop()

## Main dispatch.
if opts.mode == 'gen':

  ## Read and compile the template.  The whole thing is wrapped in a repeat,
  ## so that relations which the file name doesn't mention are iterated over
  ## within each output file.
  with open(opts.input) as f:
    text = f.read()
  templ = RepeatTemplate(compile_template(opts.input, text))

  ## Write each output under a temporary name, and rename it into place once
  ## it's complete.
  for file, cs in filenames(filetempl):
    new = file + '.new'
    with open(new, 'w') as out:
      templ.subst(out, cs)
    OS.rename(new, file)

elif opts.mode == 'list':

  ## Just print the names of the files which would be generated.
  for file, cs in filenames(filetempl):
    print(file)

else:
  die('What am I doing here?')

###----- That's all, folks --------------------------------------------------