lib/TOML/Tiny/Tokenizer.pm

   1 package TOML::Tiny::Tokenizer;
   2 # ABSTRACT: tokenizer used by TOML::Tiny
   3
   4 use strict;
   5 use warnings;
   6 no warnings qw(experimental);
   7 use charnames qw(:full);
   8 use v5.18;
   9
  10 use TOML::Tiny::Grammar;
  11
  12 sub new {
  13   my ($class, %param) = @_;
  14
  15   my $self = bless{
  16     source        => $param{source},
  17     last_position => length $param{source},
  18     position      => 0,
  19     line          => 1,
  20     tokens        => [],
  21   }, $class;
  22
  23   return $self;
  24 }
  25
  26 sub prev_token_type {
  27   my $self = shift;
  28
  29   if (@{$self->{tokens}}) {
  30     return $self->{tokens}[-1]{type} // 'EOL';
  31   }
  32
  33   return 'EOL';
  34 }
  35
  36 sub next_token {
  37   my $self = shift;
  38
  39   return unless defined $self->{source}
  40       && $self->{position} < $self->{last_position};
  41
  42   if (!@{ $self->{tokens} }) {
  43     my $root = {type => 'table', pos => 0, line => 1, value => []};
  44     $self->push_token($root);
  45     return $root;
  46   }
  47
  48   # Update the regex engine's position marker in case some other regex
  49   # attempted to match against the source string and reset it.
  50   pos($self->{source}) = $self->{position};
  51
  52   my $token;
  53   my $type;
  54   my $value;
  55
  56   state $key_set     = qr/\G ($Key) $WS* (?= =)/x;
  57   state $table       = qr/\G \[ $WS* ($Key) $WS* \] $WS* (?:$EOL | $)/x;
  58   state $array_table = qr/\G \[\[ $WS* ($Key) $WS* \]\] $WS* (?:$EOL | $)/x;
  59
  60   state $simple = {
  61     '['     => 'inline_array',
  62     ']'     => 'inline_array_close',
  63     '{'     => 'inline_table',
  64     '}'     => 'inline_table_close',
  65     ','     => 'comma',
  66     '='     => 'assign',
  67     'true'  => 'bool',
  68     'false' => 'bool',
  69   };
  70
  71   # More complex matches with regexps
  72   while ($self->{position} < $self->{last_position} && !defined($type)) {
  73     my $prev = $self->prev_token_type;
  74     my $newline = !!($prev eq 'EOL' || $prev eq 'table' || $prev eq 'array_table');
  75
  76     for ($self->{source}) {
  77       /\G$WS+/gc;               # ignore whitespace
  78       /\G$Comment$/mgc && next;  # ignore comments
  79
  80       last when /\G$/gc;
  81
  82       when (/\G$EOL/gc) {
  83         ++$self->{line};
  84         $type = 'EOL';
  85       }
  86
  87       if ($newline) {
  88         when (/$table/gc) {
  89           $type = 'table';
  90           $value = $self->tokenize_key($1);
  91         }
  92
  93         when (/$array_table/gc) {
  94           $type = 'array_table';
  95           $value = $self->tokenize_key($1);
  96         }
  97       }
  98
  99       when (/\G ( [\[\]{}=,] | true | false )/xgc) {
 100         $value = $1;
 101         $type = $simple->{$value};
 102       }
 103
 104       when (/$key_set/gc) {
 105         $type = 'key';
 106         $value = $1;
 107       }
 108
 109       when (/\G($String)/gc) {
 110         $type = 'string';
 111         $value = $1;
 112       }
 113
 114       when (/\G($DateTime)/gc) {
 115         $type = 'datetime';
 116         $value = $1;
 117       }
 118
 119       when (/\G($Float)/gc) {
 120         $type = 'float';
 121         $value = $1;
 122       }
 123
 124       when (/\G($Integer)/gc) {
 125         $type = 'integer';
 126         $value = $1;
 127       }
 128
 129       default{
 130         my $substr = substr($self->{source}, $self->{position}, 30) // 'undef';
 131         die "toml syntax error on line $self->{line}\n\t-->|$substr|\n";
 132       }
 133     }
 134
 135     if ($type) {
 136       $token = {
 137         line  => $self->{line},
 138         pos   => $self->{pos},
 139         type  => $type,
 140         value => $self->can("tokenize_$type") ? $self->can("tokenize_$type")->($self, $value) : $value,
 141       };
 142
 143       $self->push_token($token);
 144     }
 145
 146     $self->update_position;
 147   }
 148
 149   return $token;
 150 }
 151
 152 sub push_token {
 153   my $self = shift;
 154   @_ && push @{$self->{tokens}}, @_;
 155 }
 156
 157 sub pop_token {
 158   my $self = shift;
 159   pop @{$self->{tokens}};
 160 }
 161
 162 sub current_line {
 163   my $self = shift;
 164   my $rest = substr $self->{source}, $self->{position};
 165   my $stop = index $rest, "\n";
 166   substr $rest, 0, $stop;
 167 }
 168
 169 sub update_position {
 170   my $self = shift;
 171   $self->{position} = pos($self->{source}) // 0;
 172 }
 173
 174 sub error {
 175   my $self  = shift;
 176   my $token = shift;
 177   my $msg   = shift // 'unknown';
 178   my $line  = $token ? $token->{line} : $self->{line};
 179   die "toml: parse error at line $line: $msg\n";
 180 }
 181
 182 sub tokenize_key {
 183   my $self = shift;
 184   my $toml = shift;
 185   my @keys;
 186
 187   while ($toml =~ s/^ ($SimpleKey) [.]?//x) {
 188     push @keys, $1;
 189   }
 190
 191   for (@keys) {
 192     s/^["']//;
 193     s/["']$//;
 194   }
 195
 196   return \@keys;
 197 }
 198
 199 sub tokenize_float {
 200   $_[1] =~ tr/_//d;
 201   $_[1];
 202 }
 203
 204 sub tokenize_integer {
 205   $_[1] =~ tr/_+//d;
 206   $_[1];
 207 }
 208
 209 sub tokenize_string {
 210   my $self = shift;
 211   my $toml = shift;
 212   my $ml   = $toml =~ /^(?:''')|(?:""")/;
 213   my $lit  = $toml =~ /^'/;
 214   my $str  = '';
 215
 216   if ($ml) {
 217     $str = substr $toml, 3, length($toml) - 6;
 218     my @newlines = $str =~ /($CRLF)/g;
 219     $self->{line} += scalar @newlines;
 220     $str =~ s/^$WS* $EOL//x;  # trim leading whitespace
 221     $str =~ s/\\$EOL\s*//xgs; # trim newlines from lines ending in backslash
 222   } else {
 223     $str = substr($toml, 1, length($toml) - 2);
 224   }
 225
 226   if (!$lit) {
 227     $str = $self->unescape_str($str);
 228   }
 229
 230   return ''.$str;
 231 }
 232
 233 sub unescape_chars {
 234   state $esc = {
 235     '\b'   => "\x08",
 236     '\t'   => "\x09",
 237     '\n'   => "\x0A",
 238     '\f'   => "\x0C",
 239     '\r'   => "\x0D",
 240     '\"'   => "\x22",
 241     '\/'   => "\x2F",
 242     '\\\\' => "\x5C",
 243   };
 244
 245   if (exists $esc->{$_[0]}) {
 246     return $esc->{$_[0]};
 247   }
 248
 249   my $hex = hex substr($_[0], 2);
 250
 251   if (charnames::viacode($hex)) {
 252     return chr $hex;
 253   }
 254
 255   return;
 256 }
 257
 258 sub unescape_str {
 259   state $re = qr/($Escape)/;
 260   $_[1] =~ s|$re|unescape_chars($1) // $_[0]->error(undef, "invalid unicode escape: $1")|xge;
 261   $_[1];
 262 }
 263
 264 1;