#!/usr/local/bin/perl
#
# Script for building the HTML search databases for the Help system's
# Search capability.  Typically, there is one search database
# built for each product.  As an argument, specify the pathname
# of the directory subtree to be indexed.
#
# Usage (options and the directory may appear in any order):
#   SCRIPT [-o dbfile] [-omit pattern]... [-nv] [-debug] [subdir]
#
#   -o dbfile      name of the database file; it is created inside subdir
#                  (default "index.db")
#   -omit pattern  skip any file whose path matches pattern; may be repeated
#   -nv            non-verbose: do not print the name of each HTML file
#   -debug         sets $debugmode to 1
#   subdir         directory subtree to index (default "./"); any argument
#                  that is not one of the options above is taken as subdir,
#                  and the last one given wins
#
# Exits with status 1 if -o or -omit is the final argument (value missing).
#
# $Revision: 1.3.4.1 $  $Date: 2004/04/20 21:29:18 $
#

use Cwd ();    # core module; only used as a fallback when $ENV{PWD} is unset

# The settings below are deliberately package globals (no "my"):
# the IndexFile callback reads $verbosemode, @omit_items and $helproot directly.
$searchdbase = "index.db";
$verbosemode = 1;
$debugmode   = 0;

# Absolute prefix that IndexFile prepends to File::Find names.  PWD is not
# guaranteed to be in the environment (e.g. when started from cron or via
# exec from a non-shell parent); without the fallback $helproot would be
# just "/" and every later open of a full path name would fail.
{
    my $start_dir = $ENV{PWD};
    if (!defined($start_dir) || $start_dir eq "") {
        $start_dir = Cwd::getcwd();
    }
    $start_dir = "" unless defined $start_dir;
    $helproot = $start_dir . "/";
}

$subdir = "./";

@omit_items = ();    # string expressions of files or directories to omit from the database
$omit_count = 0;

# process arguments
$argcount = $#ARGV + 1;
for (my $arg_index = 0; $arg_index < $argcount; ++$arg_index) {
    my $arg = $ARGV[$arg_index];

    if ($arg eq "-o") {
        if ($arg_index == $argcount - 1) {
            print "No output file specified.\n";
            exit(1);
        }
        $searchdbase = $ARGV[++$arg_index];
    }
    elsif ($arg eq "-debug") {
        $debugmode = 1;
    }
    elsif ($arg eq "-nv") {
        $verbosemode = 0;
    }
    elsif ($arg eq "-omit") {
        if ($arg_index == $argcount - 1) {
            print "You did not specify the item to omit.\n";
            exit(1);
        }
        push @omit_items, $ARGV[++$arg_index];
        $omit_count = scalar @omit_items;    # kept in step with the list
    }
    else {
        # Anything else is the directory to index; make sure it ends in
        # exactly one slash when the caller already supplied one.
        $subdir = $arg . "/";
        $subdir =~ s|//$|/|;
    }
}

# require 5;
use DB_File;       # Access DB databases
use Fcntl;         # Needed for above...
use File::Find;    # Directory searching

$DB_File::DB_BTREE->{cachesize} = 10_000_000;    # 10meg cache
$DB_File::DB_BTREE->{psize}     = 32 * 1024;     # 32k pages

undef $/;          # Don't obey line boundaries
$currentKey = 0;

@exclusion_list = ("an", "the", "of");

############################################################################
# Delete old search database and attach %indexdb to database
$rcode = unlink($subdir . $searchdbase);
if ($rcode == 0) {
    # can't delete, so truncate
    truncate($subdir . $searchdbase, 0);
}

# NOTE: the argument list of this tie() call is completed by the text at the
# start of the following source line.
tie(%indexdb, 'DB_File', $subdir .
$searchdbase, O_RDWR | O_CREAT, 0644, $DB_File::DB_BTREE); find(\&IndexFile,"$subdir"); &FlushWordCache(); untie(%indexdb); # release database ########################################################################### sub IndexFile { if(!-f) { return; } $myfilename = $File::Find::name; ## Omit certain files if (($myfilename =~ m|tocframe\.html| ) || ($myfilename =~ m|ixframe\.html| )) { return; }; ## Also, omit the file if it matches anything on the omit list. for ($i=0; $i <= $#omit_items ; $i++) { if ($myfilename =~ m|$omit_items[$i]| ) { return; }; }; if(/\.html?$/) { # Handle HTML files if ($verbosemode == 1) { print "$File::Find::name\n"; }; open(HTML_FILE,$_) || die "Can't open $_: $!"; my($text) = ; # Read entire file close(HTML_FILE); # If there is a "See Also" section, chop it off if ($text =~ m||) { # Remove from header to next header or end of file. $text =~ s|||; $text .= ""; $text =~ s|.*?||s; }; # Remove material in bottom page footer if ($text =~ m||) { # Remove from header to end of file. $text =~ s|||; }; # Remove material in FRAMESET files if ($text =~ m||) { # Remove material between HEAD tags. $text =~ s|||s; # Remove material between SCRIPT tags. $text =~ s|||s; }; # Strip out all HTML tags $text =~ s/<[^>]*>//g; # Convert non-breaking space entities to spaces $text =~ s| | |g; # Index all the words under the current key # Map key to this filename # $indexdb{pack("xn",$currentKey)} = $File::Find::name; $myfilename = $File::Find::name; $fullpathname = $helproot . 
$myfilename; open(INFILE, $fullpathname) || die "cannot open $fullpathname for reading\n"; $count = read(INFILE, $firstline, 5000); $docname = ""; $docnameStartPos = index($firstline, ""); $docname = substr($docnameline, 0, $docnameEndPos); $docname =~ s|^ *||; $docname =~ s| *$||; }; $title = ""; $skip_this_one = 0; if ($firstline =~ m|.*|i) { $title = $&; if (($title =~ m|: Table of Contents|) || ($title =~ m|: Index|)) { $skip_this_one = 1; }; $title =~ s|<.*?>||g; if (($docname eq "") && ($title =~ m|.*\(.*\) *?$|)) { $docname = $&; $docname =~ s|^.*?\(||; $docname =~ s|\) *?$||; }; $title =~ s|\(.*?\)||; $title =~ s| *:: *$||; }; $title2 = ""; if ($firstline =~ m||) { $skip_this_one = 1; }; $title2 =~ s|||; } elsif ($title =~ m| :: |) { $title2 = $title; $title2 =~ s|^.*? :: *||; } $title3 = ""; if ($firstline =~ m|||; } elsif ($firstline =~ m||) { $title3 = $&; $title3 =~ s|||; } elsif ($title =~ m| :: |) { $title3 = $title; $title3 =~ s| :: .*||; } elsif ($firstline =~ m|