lib/TOML/Tiny/Tokenizer.pm

   1 package TOML::Tiny::Tokenizer;
   2 # ABSTRACT: tokenizer used by TOML::Tiny
   3
   4 use strict;
   5 use warnings;
   6 no warnings qw(experimental);
   7 use charnames qw(:full);
   8 use v5.18;
   9
  10 use TOML::Tiny::Grammar;
  11
  12 sub new {
  13   my ($class, %param) = @_;
  14
  15   my $self = bless{
  16     source        => $param{source},
  17     last_position => length $param{source},
  18     position      => 0,
  19     line          => 1,
  20     last_token    => undef,
  21   }, $class;
  22
  23   return $self;
  24 }
  25
  26 sub next_token {
  27   my $self = shift;
  28
  29   return unless defined $self->{source}
  30       && $self->{position} < $self->{last_position};
  31
  32   if (!$self->{last_token}) {
  33     return $self->{last_token} = {type => 'table', pos => 0, line => 1, value => []};
  34   }
  35
  36   # Update the regex engine's position marker in case some other regex
  37   # attempted to match against the source string and reset it.
  38   pos($self->{source}) = $self->{position};
  39
  40   my $token;
  41   my $type;
  42   my $value;
  43
  44   state $key_set     = qr/\G ($Key) $WS* (?= =)/x;
  45   state $table       = qr/\G \[ $WS* ($Key) $WS* \] $WS* (?:$EOL | $)/x;
  46   state $array_table = qr/\G \[\[ $WS* ($Key) $WS* \]\] $WS* (?:$EOL | $)/x;
  47
  48   state $simple = {
  49     '['     => 'inline_array',
  50     ']'     => 'inline_array_close',
  51     '{'     => 'inline_table',
  52     '}'     => 'inline_table_close',
  53     ','     => 'comma',
  54     '='     => 'assign',
  55     'true'  => 'bool',
  56     'false' => 'bool',
  57   };
  58
  59   # More complex matches with regexps
  60   while ($self->{position} < $self->{last_position} && !defined($type)) {
  61     my $prev = $self->{last_token} ? $self->{last_token}{type} : 'EOL';
  62     my $newline = !!($prev eq 'EOL' || $prev eq 'table' || $prev eq 'array_table');
  63
  64     for ($self->{source}) {
  65       /\G$WS+/gc;               # ignore whitespace
  66       /\G$Comment$/mgc && next;  # ignore comments
  67
  68       last when /\G$/gc;
  69
  70       when (/\G$EOL/gc) {
  71         ++$self->{line};
  72         $type = 'EOL';
  73       }
  74
  75       if ($newline) {
  76         when (/$table/gc) {
  77           $type = 'table';
  78           $value = $self->tokenize_key($1);
  79         }
  80
  81         when (/$array_table/gc) {
  82           $type = 'array_table';
  83           $value = $self->tokenize_key($1);
  84         }
  85       }
  86
  87       when (/\G ( [\[\]{}=,] | true | false )/xgc) {
  88         $value = $1;
  89         $type = $simple->{$value};
  90       }
  91
  92       when (/$key_set/gc) {
  93         $type = 'key';
  94         $value = $1;
  95       }
  96
  97       when (/\G($String)/gc) {
  98         $type = 'string';
  99         $value = $1;
 100       }
 101
 102       when (/\G($DateTime)/gc) {
 103         $type = 'datetime';
 104         $value = $1;
 105       }
 106
 107       when (/\G($Float)/gc) {
 108         $type = 'float';
 109         $value = $1;
 110       }
 111
 112       when (/\G($Integer)/gc) {
 113         $type = 'integer';
 114         $value = $1;
 115       }
 116
 117       default{
 118         my $substr = substr($self->{source}, $self->{position}, 30) // 'undef';
 119         die "toml syntax error on line $self->{line}\n\t-->|$substr|\n";
 120       }
 121     }
 122
 123     if ($type) {
 124       state $tokenizers = {};
 125       my $tokenize = $tokenizers->{$type} //= $self->can("tokenize_$type") || 0;
 126
 127       $token = $self->{last_token} = {
 128         line  => $self->{line},
 129         pos   => $self->{pos},
 130         type  => $type,
 131         value => $tokenize ? $tokenize->($self, $value) : $value,
 132       };
 133     }
 134
 135     $self->update_position;
 136   }
 137
 138   return $token;
 139 }
 140
 141 sub current_line {
 142   my $self = shift;
 143   my $rest = substr $self->{source}, $self->{position};
 144   my $stop = index $rest, "\n";
 145   substr $rest, 0, $stop;
 146 }
 147
 148 sub update_position {
 149   my $self = shift;
 150   $self->{position} = pos($self->{source}) // 0;
 151 }
 152
 153 sub error {
 154   my $self  = shift;
 155   my $token = shift;
 156   my $msg   = shift // 'unknown';
 157   my $line  = $token ? $token->{line} : $self->{line};
 158   die "toml: parse error at line $line: $msg\n";
 159 }
 160
 161 sub tokenize_key {
 162   my $self = shift;
 163   my $toml = shift;
 164   my @keys = $toml =~ /($SimpleKey)\.?/g;
 165
 166   for (@keys) {
 167     s/^["']//;
 168     s/["']$//;
 169   }
 170
 171   return \@keys;
 172 }
 173
 174 sub tokenize_float {
 175   $_[1] =~ tr/_//d;
 176   $_[1];
 177 }
 178
 179 sub tokenize_integer {
 180   $_[1] =~ tr/_+//d;
 181   $_[1];
 182 }
 183
 184 sub tokenize_string {
 185   my $self = shift;
 186   my $toml = shift;
 187   my $ml   = index($toml, q{'''}) == 0
 188           || index($toml, q{"""}) == 0;
 189   my $lit  = index($toml, q{'}) == 0;
 190   my $str  = '';
 191
 192   if ($ml) {
 193     $str = substr $toml, 3, length($toml) - 6;
 194     my @newlines = $str =~ /($CRLF)/g;
 195     $self->{line} += scalar @newlines;
 196     $str =~ s/^$WS* $EOL//x;  # trim leading whitespace
 197     $str =~ s/\\$EOL\s*//xgs; # trim newlines from lines ending in backslash
 198   } else {
 199     $str = substr($toml, 1, length($toml) - 2);
 200   }
 201
 202   if (!$lit) {
 203     $str = $self->unescape_str($str);
 204   }
 205
 206   return $str;
 207 }
 208
 209 sub unescape_chars {
 210   state $esc = {
 211     '\b'   => "\x08",
 212     '\t'   => "\x09",
 213     '\n'   => "\x0A",
 214     '\f'   => "\x0C",
 215     '\r'   => "\x0D",
 216     '\"'   => "\x22",
 217     '\/'   => "\x2F",
 218     '\\\\' => "\x5C",
 219   };
 220
 221   if (exists $esc->{$_[0]}) {
 222     return $esc->{$_[0]};
 223   }
 224
 225   my $hex = hex substr($_[0], 2);
 226
 227   if (charnames::viacode($hex)) {
 228     return chr $hex;
 229   }
 230
 231   return;
 232 }
 233
 234 sub unescape_str {
 235   state $re = qr/($Escape)/;
 236   $_[1] =~ s|$re|unescape_chars($1) // $_[0]->error(undef, "invalid unicode escape: $1")|xge;
 237   $_[1];
 238 }
 239
 240 1;