#! /usr/bin/perl

# Script to turn PCRE2 man pages into HTML
#
# The man page is read from STDIN and the HTML is written to STDOUT. The
# first non-option argument is the page name, which is used in the title and
# the main heading. The only option is -toc, which causes a table of contents
# (one entry per .SH section) to be written ahead of the page body.
#
# Because the table of contents has to precede the body, but is only known
# once the whole input has been read, the body is written to a temporary file
# while the contents entries go straight to STDOUT; the temporary file is
# copied to STDOUT at the end.

use strict;
use warnings;

use File::Temp qw(tempfile);

# Conversion state, shared by the main loop and the subroutines below

my $innf = 0;           # True between .nf and .fi
my $inpara = 0;         # True while a <P> is open
my $inpre = 0;          # True while a <pre> is open
my $wrotetext = 0;      # True once text has been output since end_para()
my $toc = 0;            # True if -toc was given
my $ref = 1;            # Number used for the next TOC/SEC anchor pair
my $temp;               # Output handle for the temporary body file


# Subroutine to handle font changes and other escapes
#
# Argument:  one line (or fragment) of man page text
# Returns:   the text with < and > turned into HTML entities, \fI...\fR and
#            \fB...\fR (or \fP) font changes turned into <i> and <b>
#            elements, \e turned into a backslash, and "(c)" following
#            "Copyright " turned into the copyright entity

sub do_line {
    my ($s) = @_;

    # The angle brackets must be dealt with first, so that the markup
    # inserted by the later substitutions is not itself escaped.

    $s =~ s/</&lt;/g;
    $s =~ s/>/&gt;/g;
    $s =~ s{\\fI(.*?)\\f[RP]}{<i>$1</i>}g;
    $s =~ s{\\fB(.*?)\\f[RP]}{<b>$1</b>}g;
    $s =~ s{\\e}{\\}g;
    $s =~ s/(?<=Copyright )\(c\)/&copy;/g;
    return $s;
}

# Subroutine to ensure not in a paragraph. Any open literal section is closed
# as well, and the "text written" flag is reset.

sub end_para {
    if ($inpara) {
        print {$temp} "</PRE>\n" if $inpre;
        print {$temp} "</P>\n";
    }
    $inpara = $inpre = 0;
    $wrotetext = 0;
    return;
}

# Subroutine to start a new paragraph, closing any current one first

sub new_para {
    end_para();
    print {$temp} "<P>\n";
    $inpara = 1;
    return;
}


# Main program

# Process leading options; anything starting with "-" other than -toc is
# silently discarded.

while (@ARGV && $ARGV[0] =~ /^-/) {
    $toc = 1 if $ARGV[0] eq "-toc";
    shift @ARGV;
}

# The page name; an absent argument is treated as an empty name

my $page = defined $ARGV[0] ? $ARGV[0] : '';

# Initial output to STDOUT

print <<"End";
<html>
<head>
<title>$page specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$page man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
End

print "<ul>\n" if $toc;

# The body goes to a securely created temporary file, which is removed
# automatically when the script exits, even if processing is abandoned.
# tempfile() itself dies if the file cannot be created.

my $temp_name;
($temp, $temp_name) = tempfile(UNLINK => 1);

LINE: while (<STDIN>) {

    # Handle lines beginning with a dot

    if (/^\./) {

        # Some of the PCRE2 man pages used to contain instances of .br.
        # However, they should have all been removed because they cause
        # trouble in some (other) automated systems that translate man pages
        # to HTML. Complain if we find .br or .in (another macro that is
        # deprecated).

        if (/^\.br/ || /^\.in/) {
            print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
            print STDERR "*** $_\n";
            die "*** Processing abandoned\n";
        }

        # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.

        elsif (/^\.nf/) {
            $innf = 1;
        }

        elsif (/^\.fi/) {
            $innf = 0;
        }

        # Handling .sp is subtle. If it is inside a literal section, do
        # nothing if the next line is a non literal text line; similarly, if
        # not inside a literal section, do nothing if a literal follows,
        # unless we are inside a .nf/.fi section or about to enter one. The
        # point being that the <pre> and </pre> that delimit literal sections
        # will do the spacing. Always skip if no previous output.

        elsif (/^\.sp/) {
            if ($wrotetext) {
                $_ = <STDIN>;
                $_ = '' unless defined $_;      # EOF acts as a blank line
                if ($inpre) {
                    print {$temp} "\n" if /^[\s.]/;
                }
                else {
                    print {$temp} "<br>\n<br>\n"
                        if ($innf || /^\.nf/ || !/^[\s.]/);
                }
                redo LINE;      # Now process the lookahead line we just read
            }
        }

        elsif (/^\.TP/ || /^\.PP/ || /^\.P/) {
            new_para();
        }

        elsif (/^\.SH\s*("?)(.*)\1/) {
            my $heading = $2;

            # Ignore the NAME section, including the line that follows it

            if ($heading =~ /^NAME\b/) {
                my $skipped = <STDIN>;
                next LINE;
            }

            end_para();
            my $title = do_line($heading);

            # The title is passed as a printf argument rather than being
            # interpolated into the format, so that a "%" in a title is
            # output literally.

            if ($toc) {
                printf "<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
                    $ref, $ref, $title;
                printf {$temp} "<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
                    $ref, $title;
                $ref++;
            }
            else {
                print {$temp} "<br><b>\n$title\n</b><br>\n";
            }
        }

        elsif (/^\.SS\s*("?)(.*)\1/) {
            my $heading = $2;
            end_para();
            my $title = do_line($heading);
            print {$temp} "<br><b>\n$title\n</b><br>\n";
        }

        # .B and .I output their argument in bold or italic, with any
        # paired double quotes removed.

        elsif (/^\.B\s*(.*)/) {
            my $text = $1;
            new_para() if !$inpara;
            $text = do_line($text);
            $text =~ s/"(.*?)"/$1/g;
            print {$temp} "<b>$text</b>\n";
            $wrotetext = 1;
        }

        elsif (/^\.I\s*(.*)/) {
            my $text = $1;
            new_para() if !$inpara;
            $text = do_line($text);
            $text =~ s/"(.*?)"/$1/g;
            print {$temp} "<i>$text</i>\n";
            $wrotetext = 1;
        }

        # Remove the "AUTOMATICALLY GENERATED" warning from pcre2demo.3

        elsif (/^\.\\"AUTOMATICALLY GENERATED/) {
            next LINE;
        }

        # A comment that starts "HREF" takes the next line as a name that
        # is turned into a hyperlink, using the text given, which might be
        # in a special font. If it ends in () or (digits) or punctuation,
        # they aren't part of the link. If no name can be extracted, the
        # text is output without a link and a diagnostic is written.

        elsif (/^\.\\"\s*HREF/) {
            my $text = <STDIN>;
            $text = '' unless defined $text;
            chomp $text;
            $text = do_line($text);
            $text =~ s/\s+$//;
            if ($text =~ /^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/) {
                print {$temp} "<a href=\"$1.html\">$text</a>\n";
            }
            else {
                print STDERR "*** Cannot derive a link target from \"$text\"\n";
                print {$temp} "$text\n";
            }
        }

        # A comment that starts "HTML" inserts literal HTML

        elsif (/^\.\\"\s*HTML\s*(.*)/) {
            print {$temp} $1;
        }

        # A comment that starts < inserts that HTML at the end of the
        # *next* input line - so as not to get a newline between them.

        elsif (/^\.\\"\s*(<.*>)/) {
            my $markup = $1;
            my $text = <STDIN>;
            $text = '' unless defined $text;
            chomp $text;
            $text = do_line($text);
            $text =~ s/\s+$//;
            print {$temp} "$text$markup\n";
        }

        # A comment that starts JOIN joins the next two lines together, with
        # one space between them. Then that line is processed. This is used
        # in some displays where two lines are needed for the "man" version.
        # JOINSH works the same, except that it assumes this is a shell
        # command, so removes continuation backslashes.

        elsif (/^\.\\"\s*JOIN(SH)?/) {
            my $is_shell = defined $1;
            my $one = <STDIN>;
            my $two = <STDIN>;
            $one = '' unless defined $one;
            $two = '' unless defined $two;
            $one =~ s/\s*\\e\s*$// if $is_shell;
            chomp $one;
            $two =~ s/^\s+//;
            $_ = "$one $two";
            redo LINE;          # Process the joined lines
        }

        # .EX/.EE are used in the pcre2demo page to bracket the entire
        # program, which is unmodified except for turning backslash into
        # "\e". The characters that are special in HTML are escaped here.
        #
        # NOTE(review): no closing </PRE> is written after .EE; confirm
        # whether the input is expected to supply one.

        elsif (/^\.EX\s*$/) {
            print {$temp} "<PRE>\n";
            while (<STDIN>) {
                last if /^\.EE\s*$/;
                s/\\e/\\/g;
                s/&/&amp;/g;
                s/</&lt;/g;
                s/>/&gt;/g;
                print {$temp} $_;
            }
        }

        # Ignore anything not recognized

        next LINE;
    }

    # Line does not begin with a dot. Replace blank lines with new paragraphs

    if (/^\s*$/) {
        end_para() if $wrotetext;
        next LINE;
    }

    # Convert fonts changes and output an ordinary line. Ensure that indented
    # lines are marked as literal.

    $_ = do_line($_);
    new_para() if !$inpara;

    if (/^\s/) {
        if (!$inpre) {
            print {$temp} "<pre>\n";
            $inpre = 1;
        }
    }
    elsif ($inpre) {
        print {$temp} "</pre>\n";
        $inpre = 0;
    }

    # Add <br> to the end of a non-literal line if we are within .nf/.fi

    $_ .= "<br>\n" if (!$inpre && $innf);

    print {$temp} $_;
    $wrotetext = 1;
}

# The TOC, if present, will have been written - terminate it

print "</ul>\n" if $toc;

# Copy the remainder to the standard output. Closing the write handle first
# ensures that everything has reached the file, and reports any write error.

close($temp) or die "Can't close $temp_name: $!\n";
open(my $body, '<', $temp_name)
    or die "Can't open $temp_name for input: $!\n";

while (my $line = <$body>) {
    print $line;
}

print <<"End";
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
End

close($body);

# End