3 ##---------------------------------------------------------------------------##
5 ## @(#) man2html 1.2 97/08/12 12:57:30 @(#)
7 ## Earl Hood, ehood@medusa.acs.uci.edu
9 ## man2html is a Perl program to convert formatted nroff output
12 ## Recommended command-line options based on platform:
15 ## ---------------------------------------------------------------------
16 ## c2mp <None, the defaults should be okay>
17 ## hp9000s700/800 -leftm 1 -topm 8
19 ## ---------------------------------------------------------------------
21 ##---------------------------------------------------------------------------##
22 ## Copyright (C) 1995-1997 Earl Hood, ehood@medusa.acs.uci.edu
24 ## This program is free software; you can redistribute it and/or modify
25 ## it under the terms of the GNU General Public License as published by
26 ## the Free Software Foundation; either version 2 of the License, or
27 ## (at your option) any later version.
29 ## This program is distributed in the hope that it will be useful,
30 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
31 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
32 ## GNU General Public License for more details.
34 ## You should have received a copy of the GNU General Public License
35 ## along with this program; if not, write to the Free Software
36 ## Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
38 ##---------------------------------------------------------------------------##
44 ($PROG = $0) =~ s/.*\///; # Program name: $0 with everything up to the last "/" removed
47 ## Input and output filehandles (defaults are applied only if not already set)
48 $InFH = \*STDIN unless $InFH;
49 $OutFH = \*STDOUT unless $OutFH;
51 ## Backspace character: Used in overstriking detection
54 ## Hash of section titles and their HTML tag wrapper.
55 ## This list allows customization of what HTML tag is used for
56 ## a given section head.
58 ## The section title can be a regular expression. Therefore, one must
59 ## be careful about quoting special characters.
63 '\S.*OPTIONS.*' => '<H2>',
66 'COMPATIBILITY' => '<H2>',
67 'DEPENDENCIES' => '<H2>',
68 'DESCRIPTION' => '<H2>',
69 'DIAGNOSTICS' => '<H2>',
70 'ENVIRONMENT' => '<H2>',
73 'EXTERNAL INFLUENCES' => '<H2>',
75 'LIMITATIONS' => '<H2>',
79 'REFERENCES' => '<H2>',
80 'RETURN VALUE' => '<H2>',
81 'SECTION.*:' => '<H2>',
83 'STANDARDS CONFORMANCE' => '<H2>',
84 'STYLE CONVENTION' => '<H2>',
88 '\s+Section.*:' => '<H3>',
92 ## Fallback heading tag, used when a line matches none of the
## section title patterns above
93 $HeadFallback = '<H2>';
97 $Bare = 0; # Skip printing HTML head/foot flag
98 $BTag = 'B'; # HTML element used for overstruck text
99 $CgiUrl = ''; # CGI URL expression
100 $Compress = 0; # Do blank line compression flag
101 $K = 0; # Do keyword search processing flag
102 $NoDepage = 0; # Do not strip page information
103 $NoHeads = 0; # Do not do section head detection flag
104 $SeeAlso = 0; # Do only SEE ALSO xrefs flag
105 $Solaris = 0; # Solaris keyword search processing flag
106 $Sun = 0; # Section heads are not overstruck in input flag
108 $UTag = 'I'; # HTML element used for underlined text
109 $ftsz = 7; # Bottom margin size (lines)
110 $hdsz = 7; # Top margin size (lines)
111 $leftm = ''; # Left margin pad (string of $leftmsz spaces)
112 $leftmsz = 0; # Left margin size (characters)
113 $pgsz = 66; # Page size (number of lines in a page)
114 $txsz = 52; # Text body length: $pgsz - ($hdsz + $ftsz)
116 #############################################################################
118 #############################################################################
120 if (get_cli_opts()) {
131 #############################################################################
133 #############################################################################
137 ## Define while loop and then eval it when used. The reason
138 ## is to avoid the regular expression reevaluation in the
139 ## section head detection code.
141 $doitcode =<<'EndOfDoItCode';
143 my($line, $tmp, $i, $head, $preindent, $see_also, $do);
145 $see_also = !$SeeAlso;
146 print $OutFH "<!-- Manpage converted by man2html $VERSION -->\n";
147 LOOP: while(!eof($InFH)) {
149 for ($i=0; $i < $hdsz; $i++) {
150 last LOOP unless defined($_ = <$InFH>);
152 for ($i=0; $i < $txsz; $i++) {
153 last LOOP unless defined($_ = <$InFH>);
155 ## Check if compress consecutive blank lines
156 if ($Compress and !/\S/) {
157 if ($blank) { next; } else { $blank = 1; }
162 ## Try to check if line space is needed at page boundaries ##
163 if (!$NoDepage && ($i==0 || $i==($txsz-1)) && !/^\s*$/) {
164 /^(\s*)/; $tmp = length($1);
166 if ($tmp < $preindent) { print $OutFH "\n"; }
172 $do = 0; $preindent = 0;
177 entitize(\$_); # Convert [$<>] to entity references
179 ## Check for 'SEE ALSO' link only
180 if (!$see_also && $CgiUrl && $SeeAlso) {
181 ($tmp = $line) =~ s/.\010//go;
182 if ($tmp =~ /^\s*SEE\s+ALSO\s*$/o) { $see_also = 1; }
183 else { $see_also = 0; }
186 ## Create anchor links for manpage references
187 s/((((.\010)+)?[\+_\.\w-])+\(((.\010)+)?
192 ## Emphasize underlined words
193 # s/((_\010[^_])+[\.\(\)_]?(_\010[^_])+\)?)/emphasize($1)/oge;
194 # s/((_\010[^_])+([\.\(\)_]?(_\010[^_])+)?)/emphasize($1)/oge;
196 # The previous expressions were trying to be clever about
197 # detecting underlined text which contain non-alphanumeric
198 # characters. nroff will not underline non-alphanumeric
199 # characters in an underlined phrase, and the above was trying
200 # to detect that. It does not work all the time, and it
201 # screws up other text, so a simplified expression is used.
203 s/((_\010[^_])+)/emphasize($1)/oge;
206 ## Check for strong text and headings
207 if ($Sun || /.\010./o) {
209 $line =~ s/.\010//go;
210 $tmp = $HeadFallback;
213 ## Create switch statement for detecting a heading
215 $doitcode .= "HEADSW: {\n";
216 foreach $head (keys %SectionHead) {
217 $doitcode .= join("", "\$tmp = '$SectionHead{$head}', ",
218 "\$secth = 1, last HEADSW ",
219 "if \$line =~ /^$leftm$head/o;\n");
225 $doitcode .=<<'EndOfDoItCode';
226 if ($secth || $line =~ /^$leftm\S/o) {
228 $_ = $tmp . $line . $tmp;
230 $_ = "\n</PRE>\n" . $_ . "<PRE>\n";
232 s/(((.\010)+.)+)/strongize($1)/oge;
235 s/(((.\010)+.)+)/strongize($1)/oge;
241 for ($i=0; $i < $ftsz; $i++) {
242 last LOOP unless defined($_ = <$InFH>);
248 ## Perform processing.
250 printhead() unless $Bare;
251 print $OutFH "<PRE>\n";
252 eval $doitcode; # $doitcode defined above
253 print $OutFH "</PRE>\n";
254 printtail() unless $Bare;
257 ##---------------------------------------------------------------------------
262 "bare", # Leave out HTML, HEAD, BODY tags.
263 "belem=s", # HTML Element for overstriked text (def: "B")
264 "botm=i", # Number of lines for bottom margin (def: 7)
265 "cgiurl=s", # CGI URL for linking to other manpages
266 "cgiurlexp=s", # CGI URL Perl expr for linking to other manpages
267 "compress", # Compress consecutive blank lines
268 "headmap=s", # Filename of user section head map file
269 "k", # Process input from 'man -k' output.
270 "leftm=i", # Character width of left margin (def: 0)
271 "nodepage", # Do not remove pagination lines
272 "noheads", # Do not detect for section heads
273 "pgsize=i", # Number of lines in a page (def: 66)
274 "seealso", # Link to other manpages only in the SEE ALSO section
275 "solaris", # Parse 'man -k' output from a solaris system
276 "sun", # Section heads are not overstriked in input
277 "title=s", # Title of manpage (def: Not defined)
278 "topm=i", # Number of lines for top margin (def: 7)
279 "uelem=s", # HTML Element for underlined text (def: "I")
281 "help" # Short usage message
283 return 0 if defined($opt_help);
285 $pgsz = $opt_pgsize || $pgsz;
286 if (defined($opt_nodepage)) {
290 $hdsz = $opt_topm if defined($opt_topm);
291 $ftsz = $opt_botm if defined($opt_botm);
293 $txsz = $pgsz - ($hdsz + $ftsz);
294 $leftmsz = $opt_leftm if defined($opt_leftm);
295 $leftm = ' ' x $leftmsz;
297 $Bare = defined($opt_bare);
298 $Compress = defined($opt_compress);
299 $K = defined($opt_k);
300 $NoDepage = defined($opt_nodepage);
301 $NoHeads = defined($opt_noheads);
302 $SeeAlso = defined($opt_seealso);
303 $Solaris = defined($opt_solaris);
304 $Sun = defined($opt_sun);
306 $Title = $opt_title || $Title;
307 $CgiUrl = $opt_cgiurlexp ||
308 ($opt_cgiurl ? qq{return "$opt_cgiurl"} : '');
310 $BTag = $opt_belem || $BTag;
311 $UTag = $opt_uelem || $UTag;
315 if (defined($opt_headmap)) {
316 require $opt_headmap or warn "Unable to read $opt_headmap\n";
321 ##---------------------------------------------------------------------------
323 print $OutFH "<HTML>\n";
324 print $OutFH "<HEAD>\n",
325 "<TITLE>$Title</TITLE>\n",
326 "</HEAD>\n" if $Title;
327 print $OutFH "<BODY>\n";
328 print $OutFH "<H1>$Title</H1>\n",
332 ##---------------------------------------------------------------------------
334 print $OutFH <<EndOfRef;
337 Man(1) output converted with
338 <a href="http://www.oac.uci.edu/indiv/ehood/man2html.html">man2html</a>
345 ##---------------------------------------------------------------------------
349 $txt = "<$UTag>$txt</$UTag>";
353 ##---------------------------------------------------------------------------
357 $txt = "<$BTag>$txt</$BTag>";
361 ##---------------------------------------------------------------------------
365 ## Check for special characters in overstrike text ##
366 $$txt =~ s/_\010\&/strike('_', '&')/geo;
367 $$txt =~ s/_\010</strike('_', '<')/geo;
368 $$txt =~ s/_\010>/strike('_', '>')/geo;
370 $$txt =~ s/(\&\010)+\&/strike('&', '&')/geo;
371 $$txt =~ s/(<\010)+</strike('<', '<')/geo;
372 $$txt =~ s/(>\010)+>/strike('>', '>')/geo;
374 ## Check for special characters in regular text. Must be careful
375 ## to check before/after character in expression because it might be
376 ## a special character.
377 $$txt =~ s/([^\010]\&[^\010])/htmlize2($1)/geo;
378 $$txt =~ s/([^\010]<[^\010])/htmlize2($1)/geo;
379 $$txt =~ s/([^\010]>[^\010])/htmlize2($1)/geo;
382 ##---------------------------------------------------------------------------
383 ## escape special characters in a string, in-place
387 $$str =~ s/&/\&/g;
388 $$str =~ s/</\</g;
389 $$str =~ s/>/\>/g;
393 ##---------------------------------------------------------------------------
394 ## htmlize2() is used by entitize.
398 $str =~ s/&/\&/g;
404 ##---------------------------------------------------------------------------
405 ## strike converts HTML special characters in overstriked text
406 ## into entity references. The entities are overstriked so
407 ## strongize() and emphasize() will recognize the entity to be
415 $ret = "_$bs\&_${bs}a_${bs}m_${bs}p_${bs};";
416 } elsif ($char eq '<') {
417 $ret = "_$bs\&_${bs}l_${bs}t_${bs};";
418 } elsif ($char eq '>') {
419 $ret = "_$bs\&_${bs}g_${bs}t_${bs};";
421 warn qq|Unrecognized character, "$char", passed to strike()\n|;
425 $ret = "\&$bs\&a${bs}am${bs}mp${bs}p;${bs};";
426 } elsif ($char eq '<') {
427 $ret = "\&$bs\&l${bs}lt${bs}t;${bs};";
428 } elsif ($char eq '>') {
429 $ret = "\&$bs\&g${bs}gt${bs}t;${bs};";
431 warn qq|Unrecognized character, "$char", passed to strike()\n|;
437 ##---------------------------------------------------------------------------
438 ## make_xref() converts a manpage crossreference into a hyperlink.
442 $str =~ s/.\010//go; # Remove overstriking
445 my($title,$section,$subsection) =
446 ($str =~ /([\+_\.\w-]+)\((\d)(\w?)\)/);
448 $title =~ s/\+/%2B/g;
449 my($href) = (eval $CgiUrl);
450 qq|<B><A HREF="$href">$str</A></B>|;
456 ##---------------------------------------------------------------------------
457 ## man_k() processes a keyword search. The problem we have is there
458 ## is no standard for keyword search results from man. Solaris
459 ## systems have a different enough format to warrant dealing
460 ## with it as a special case. For other cases, we try our best.
461 ## Unfortunately, there are some lines of results that may be
465 my($line,$refs,$section,$subsection,$desc,$i,
466 %Sec1, %Sec1sub, %Sec2, %Sec2sub, %Sec3, %Sec3sub,
467 %Sec4, %Sec4sub, %Sec5, %Sec5sub, %Sec6, %Sec6sub,
468 %Sec7, %Sec7sub, %Sec8, %Sec8sub, %Sec9, %Sec9sub,
469 %SecN, %SecNsub, %SecNsec);
471 printhead() unless $Bare;
472 print $OutFH "<!-- Man keyword results converted by ",
473 "man2html $VERSION -->\n";
475 while ($line = <$InFH>) {
476 next if $line !~ /\(\d\w?\)\s+-\s/; # check if line can be handled
477 ($refs,$section,$subsection,$desc) =
478 $line =~ /^\s*(.*)\((\d)(\w?)\)\s*-\s*(.*)$/;
481 $refs =~ s/^\s*([\+_\.\w-]+)\s+([\+_\.\w-]+)\s*$/$1/;
484 $refs =~ s/\s(and|or)\s/,/gi; # Convert and/or to commas
485 $refs =~ s/^[^:\s]:\s*//; # Remove prefixed whatis path
487 $refs =~ s/\s//g; # Remove all whitespace
488 $refs =~ s/,/, /g; # Put space after comma
489 htmlize(\$desc); # Check for special chars in desc
490 $desc =~ s/^(.)/\U$1/; # Uppercase first letter in desc
492 if ($section eq '1') {
493 $Sec1{$refs} = $desc; $Sec1sub{$refs} = $subsection;
494 } elsif ($section eq '2') {
495 $Sec2{$refs} = $desc; $Sec2sub{$refs} = $subsection;
496 } elsif ($section eq '3') {
497 $Sec3{$refs} = $desc; $Sec3sub{$refs} = $subsection;
498 } elsif ($section eq '4') {
499 $Sec4{$refs} = $desc; $Sec4sub{$refs} = $subsection;
500 } elsif ($section eq '5') {
501 $Sec5{$refs} = $desc; $Sec5sub{$refs} = $subsection;
502 } elsif ($section eq '6') {
503 $Sec6{$refs} = $desc; $Sec6sub{$refs} = $subsection;
504 } elsif ($section eq '7') {
505 $Sec7{$refs} = $desc; $Sec7sub{$refs} = $subsection;
506 } elsif ($section eq '8') {
507 $Sec8{$refs} = $desc; $Sec8sub{$refs} = $subsection;
508 } elsif ($section eq '9') {
509 $Sec9{$refs} = $desc; $Sec9sub{$refs} = $subsection;
511 $SecN{$refs} = $desc; $SecNsec{$refs} = $section;
512 $SecNsub{$refs} = $subsection;
515 print_mank_sec(\%Sec1, 1, \%Sec1sub);
516 print_mank_sec(\%Sec2, 2, \%Sec2sub);
517 print_mank_sec(\%Sec3, 3, \%Sec3sub);
518 print_mank_sec(\%Sec4, 4, \%Sec4sub);
519 print_mank_sec(\%Sec5, 5, \%Sec5sub);
520 print_mank_sec(\%Sec6, 6, \%Sec6sub);
521 print_mank_sec(\%Sec7, 7, \%Sec7sub);
522 print_mank_sec(\%Sec8, 8, \%Sec8sub);
523 print_mank_sec(\%Sec9, 9, \%Sec9sub);
524 print_mank_sec(\%SecN, 'N', \%SecNsub, \%SecNsec);
526 printtail() unless $Bare;
528 ##---------------------------------------------------------------------------
529 ## print_mank_sec() prints out manpage cross-refs of a specific section.
532 my($sec, $sect, $secsub, $secsec) = @_;
533 my(@array, @refs, $href, $item, $title, $subsection, $i, $section,
537 @array = sort keys %$sec;
539 print $OutFH "<H2>Section $section</H2>\n",
541 foreach $item (@array) {
542 @refs = split(/,/, $item);
543 $section = $secsec->{$item} if $sect eq 'N';
544 $subsection = $secsub->{$item};
546 ($title = $refs[0]) =~ s/\(\)//g; # watch out for extra ()'s
547 $xref = eval $CgiUrl;
549 print $OutFH "<DT>\n";
553 print $OutFH qq|<B><A HREF="$xref">$_</A></B>|;
557 print $OutFH ", " if $i < $#refs;
560 print $OutFH " ($section$subsection)\n",
562 $sec->{$item}, "</DD>\n";
564 print $OutFH "</DL>\n";
568 ##---------------------------------------------------------------------------
571 print $OutFH <<EndOfUsage;
572 Usage: $PROG [ options ] < infile > outfile
574 -bare : Do not put in HTML, HEAD, BODY tags
575 -belem <elem> : HTML Element for overstriked text (def: "B")
576 -botm <#> : Number of lines for bottom margin (def: 7)
577 -cgiurl <url> : URL for linking to other manpages
578 -cgiurlexp <url> : Perl expression URL for linking to other manpages
579 -compress : Compress consective blank lines
580 -headmap <file> : Filename of user section head map file
582 -k : Process a keyword search result
583 -leftm <#> : Character width of left margin (def: 0)
584 -nodepage : Do not remove pagination lines
585 -noheads : Turn off section head detection
586 -pgsize <#> : Number of lines in a page (def: 66)
587 -seealso : Link to other manpages only in the SEE ALSO section
588 -solaris : Process keyword search result in Solaris format
589 -sun : Section heads are not overstriked in input
590 -title <string> : Title of manpage (def: Not defined)
591 -topm <#> : Number of lines for top margin (def: 7)
592 -uelem <elem> : HTML Element for underlined text (def: "I")
595 $PROG takes formatted manpages from STDIN and converts it to HTML sent
596 to STDOUT. The -topm and -botm arguments are the number of lines to the
597 main body text and NOT to the running headers/footers.
601 Copyright (C) 1995-1997 Earl Hood, ehood\@medusa.acs.uci.edu
602 $PROG comes with ABSOLUTELY NO WARRANTY and $PROG may be copied only
603 under the terms of the GNU General Public License, which may be found in
604 the $PROG distribution.