xref: /aosp_15_r20/external/regex-re2/re2/make_perl_groups.pl (revision ccdc9c3e24c519bfa4832a66aa2e83a52c19f295)
1*ccdc9c3eSSadaf Ebrahimi#!/usr/bin/perl
2*ccdc9c3eSSadaf Ebrahimi# Copyright 2008 The RE2 Authors.  All Rights Reserved.
3*ccdc9c3eSSadaf Ebrahimi# Use of this source code is governed by a BSD-style
4*ccdc9c3eSSadaf Ebrahimi# license that can be found in the LICENSE file.
5*ccdc9c3eSSadaf Ebrahimi
6*ccdc9c3eSSadaf Ebrahimi# Generate table entries giving character ranges
7*ccdc9c3eSSadaf Ebrahimi# for POSIX/Perl character classes.  Rather than
8*ccdc9c3eSSadaf Ebrahimi# figure out what the definition is, it is easier to ask
9*ccdc9c3eSSadaf Ebrahimi# Perl about each letter from 0-128 and write down
10*ccdc9c3eSSadaf Ebrahimi# its answer.
11*ccdc9c3eSSadaf Ebrahimi
# POSIX bracket-class names, spelled the way they appear inside [...]
# in a regexp.  One table is generated for each entry, in this order.
@posixclasses = qw(
	[:alnum:]
	[:alpha:]
	[:ascii:]
	[:blank:]
	[:cntrl:]
	[:digit:]
	[:graph:]
	[:lower:]
	[:print:]
	[:punct:]
	[:space:]
	[:upper:]
	[:word:]
	[:xdigit:]
);
28*ccdc9c3eSSadaf Ebrahimi
# Perl shorthand classes: a backslash followed by one letter (\d, \s, \w).
@perlclasses = map { "\\$_" } qw(d s w);
34*ccdc9c3eSSadaf Ebrahimi
# Forced answers that take precedence over what the running Perl reports.
# Keys have the form "<class name>:<decimal character code>"; the value is
# the membership to record (0 = not a member).
%overrides = (
	# Character 11 is vertical tab.  Before Perl 5.18, \s did not
	# match it, and RE2 preserves that original behaviour.
	'\s:11' => 0,
);
40*ccdc9c3eSSadaf Ebrahimi
# Work out which character codes belong to the class $cname.
#
# $cname is placed inside a bracket expression, so both "[:alpha:]" and
# "\\d" are accepted.  Codes 0 through 128 are each tested against that
# pattern, except where %overrides holds a defined value under the key
# "$cname:$code", in which case that value decides membership instead.
#
# Returns a list of [first, last] array refs (inclusive bounds), one per
# run of consecutive members, in ascending order.  A run that is still
# open after code 128 is closed at 255.
sub ComputeClass($) {
  my ($cname) = @_;
  my $matcher = qr/[$cname]/;
  my @ranges;
  my $first;  # first code of the run being built; undef when no run is open
  foreach my $code (0 .. 128) {
    my $forced = $overrides{"$cname:$code"};
    my $member = defined($forced) ? $forced : (chr($code) =~ $matcher);
    if ($member) {
      $first = $code unless defined $first;
    } elsif (defined $first) {
      push @ranges, [$first, $code - 1];
      undef $first;
    }
  }
  # Code 128 stands in for every code from 128 to 255.
  push @ranges, [$first, 255] if defined $first;
  return @ranges;
}
61*ccdc9c3eSSadaf Ebrahimi
# Emit the C++ range table for one character class and build the two
# UGroup initializers that refer to it.
#
#   $cnum    number used to name the emitted array (code$cnum)
#   $cname   class name as written in a regexp, e.g. "[:alpha:]" or "\\d"
#   @ranges  list of [lo, hi] array refs with inclusive bounds
#
# Prints "static const URange16 code$cnum[] = { ... };" to the currently
# selected output handle.  Returns two strings: the initializer for the
# class itself (sign +1) and the one for its negation (sign -1).  Both
# point at the same table.
sub PrintClass($$@) {
  my ($cnum, $cname, @ranges) = @_;
  print "static const URange16 code${cnum}[] = {  /* $cname */\n";
  foreach my $range (@ranges) {
    my ($lo, $hi) = @$range;
    printf "\t{ 0x%x, 0x%x },\n", $lo, $hi;
  }
  print "};\n";
  my $n = @ranges;
  # Double every backslash so the name survives inside a C++ string literal.
  my $escname = $cname;
  $escname =~ s/\\/\\\\/g;
  # Declared lexically so that a call does not write a package global.
  my $negname = $escname;
  if ($negname =~ /:/) {
    # POSIX spelling: [:alpha:] becomes [:^alpha:].
    $negname =~ s/:/:^/;
  } else {
    # Perl spelling: \d becomes \D.
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
}
81*ccdc9c3eSSadaf Ebrahimi
# Number of range tables emitted so far.  It is shared by every call to
# PrintClasses so that each codeN[] array gets a distinct name.
my $cnum = 0;

# Emit the range tables for every class in @classes, followed by the
# "${pname}_groups" array that indexes them and the matching
# "num_${pname}_groups" count.
#
#   $pname    prefix for the generated identifiers, e.g. "perl" or "posix"
#   @classes  class names, passed one at a time to ComputeClass/PrintClass
sub PrintClasses($@) {
  my ($pname, @classes) = @_;
  my @entries;
  for my $class (@classes) {
    $cnum += 1;
    # Each class yields two entries: itself and its negation.
    push @entries, PrintClass($cnum, $class, ComputeClass($class));
  }
  print "const UGroup ${pname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";
  print "const int num_${pname}_groups = " . scalar(@entries) . ";\n";
}
99*ccdc9c3eSSadaf Ebrahimi
# Write the generated C++ source to standard output: a fixed header, the
# Perl-class tables, the POSIX-class tables, then the closing brace of the
# namespace.  The here-docs are single-quoted because nothing in them is
# meant to be interpolated.
print <<'HEADER';
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

HEADER

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

print <<'FOOTER';

}  // namespace re2
FOOTER
117