#!/usr/bin/perl
# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# Generate table entries giving character ranges
# for POSIX/Perl character classes.  Rather than
# figure out what the definition is, it is easier to ask
# Perl about each letter from 0-128 and write down
# its answer.

use strict;
use warnings;

# POSIX bracket-class names to tabulate.
my @posixclasses = (
    "[:alnum:]",
    "[:alpha:]",
    "[:ascii:]",
    "[:blank:]",
    "[:cntrl:]",
    "[:digit:]",
    "[:graph:]",
    "[:lower:]",
    "[:print:]",
    "[:punct:]",
    "[:space:]",
    "[:upper:]",
    "[:word:]",
    "[:xdigit:]",
);

# Perl escape classes to tabulate.
my @perlclasses = (
    "\\d",
    "\\s",
    "\\w",
);

# Forced answers, keyed by "<class name>:<code point>".  A defined value
# replaces whatever the running Perl's regex engine would report.
my %overrides = (
    # Prior to Perl 5.18, \s did not match vertical tab.
    # RE2 preserves that original behaviour.
    "\\s:11" => 0,
);

# ComputeClass($cname)
#
# Asks Perl which code points in 0..128 belong to the class $cname (the
# name is wrapped in a bracket expression, e.g. "[\d]" or "[[:alpha:]]")
# and returns the members as a list of [lo, hi] array refs, in ascending
# order.  An entry in %overrides takes precedence over the regex result.
# A run that is still open after code point 128 is closed at 0xFF.
sub ComputeClass($) {
    my ($cname) = @_;
    my $regexp = qr/[$cname]/;
    my @ranges;
    my $start;    # first code point of the run in progress, undef if none

    for my $i (0 .. 128) {
        # Defined-or: an override of 0 must win, so plain || would be wrong.
        my $member = $overrides{"$cname:$i"} // (chr($i) =~ $regexp);
        if ($member) {
            $start //= $i;
        }
        elsif (defined $start) {
            push @ranges, [$start, $i - 1];
            undef $start;
        }
    }

    # Flush a run that reached the end of the probed interval.
    push @ranges, [$start, 255] if defined $start;

    return @ranges;
}

# PrintClass($cnum, $cname, @ranges)
#
# Prints the C array "code<cnum>" holding @ranges (each an [lo, hi] array
# ref) to the currently selected output handle, then returns two
# initializer strings that refer to it: one for the class itself (sign +1)
# and one for its negation (sign -1).  The negated name is formed by
# inserting "^" after the first colon when the name contains one
# ("[:alpha:]" -> "[:^alpha:]"), otherwise by upper-casing its letters
# ("\d" -> "\D").
sub PrintClass($$@) {
    my ($cnum, $cname, @ranges) = @_;

    print "static const URange16 code${cnum}[] = { /* $cname */\n";
    for my $range (@ranges) {
        printf "\t{ 0x%x, 0x%x },\n", $range->[0], $range->[1];
    }
    print "};\n";

    my $n = @ranges;

    # Double the backslashes so the name survives as a C string literal.
    my $escname = $cname;
    $escname =~ s/\\/\\\\/g;

    # Lexical, so the value does not leak into a package variable.
    my $negname = $escname;
    if ($negname =~ /:/) {
        $negname =~ s/:/:^/;
    }
    else {
        $negname =~ y/a-z/A-Z/;
    }

    return "{ \"$escname\", +1, code$cnum, $n }",
           "{ \"$negname\", -1, code$cnum, $n }";
}

# Running index used to give every emitted range array a unique name;
# shared across all PrintClasses calls.
my $cnum = 0;

# PrintClasses($pname, @classes)
#
# Prints one range array per class in @classes, followed by the
# "<pname>_groups" table listing each class and its negation, and the
# "num_<pname>_groups" element count.
sub PrintClasses($@) {
    my ($pname, @classes) = @_;

    my @entries;
    for my $cname (@classes) {
        my @ranges = ComputeClass($cname);
        push @entries, PrintClass(++$cnum, $cname, @ranges);
    }

    print "const UGroup ${pname}_groups[] = {\n";
    print "\t$_,\n" for @entries;
    print "};\n";

    my $count = @entries;
    print "const int num_${pname}_groups = $count;\n";
}

# NOTE(review): the print statement below continues on the next line of
# this file.  That line looks damaged in this copy (the here-document
# operator that should precede the EOF-terminated text appears to be
# missing) -- confirm against a clean copy before running.
print
<perl_groups.cc #include "re2/unicode_groups.h" namespace re2 { EOF PrintClasses("perl", @perlclasses); PrintClasses("posix", @posixclasses); print <