#!/usr/local/bin/perl
# ice2-idx.pl - create index file
#
# ICE Version 1.5 beta 3 rev2
# March 2000
# (C) Christian Neuss / fax (425) 732-7343
#
# Have this script called up on a regular basis via 'cron'.
# If that's not possible, re-index manually whenever changes
# to your document hierarchy have been made.
#
# Make sure that if, for security reasons, it is being executed
# with a user id other than root, this user has
# both read access to the html files and write access to
# the index file.
#
# Here's an example of a crontab entry (crontab syntax varies
# between different platforms):
#
#   1 20 * * * neuss /usr/httpd/ice-idx.pl >/dev/console 2>&1

use strict;
use warnings;
use File::Find ();    # replaces the Perl 4 "find.pl" library

our (@SEARCHDIRS, $INDEXFILE, $ISO, $TYPE, $MINLEN, $MAXPERCENT, $SUFFIXES);

#--- start of configuration --- put your changes here ---

# NOTE: Depending on your Perl implementation, you may
# have to use different path separators in the following
# paths when you are on a Macintosh or PC system. In that
# case, a path may look like e.g. "usr:foo:bar" (Mac), or
# "\\usr\\foo\\bar" resp. '\usr\foo\bar' (PC).

# The physical directory/directories to scan for html files.
# Example:
#   @SEARCHDIRS=('/usr/www/dir','/tmp/html','/usr/foo/html-dir');
@SEARCHDIRS=(
#  "/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs",
#  "/users/neuss/.public_html",
  "E:\\wildboar",
);

# Location of the index file.
# Example:
#   $INDEXFILE='/usr/local/httpd/index.idx';
# $INDEXFILE='/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs/cgi/ice/index.idx';
$INDEXFILE='E:\\wildboar\cgi\ice\ice1-5\index.idx';

# The ICE indexer will support full international characters by
# converting them to a canonical form if $ISO is set to "y". For
# servers that contain english text only, you can improve indexing
# speed by setting $ISO to "n".
$ISO="y";

# Type of system (for figuring out the path delimiting character)
# that ice-idx.pl runs on. Select one of "UNIX", "MAC", or "PC".
# Important: If you use NT, depending on the Perl binary, the
# correct setting can be either PC or UNIX!
$TYPE="UNIX";

# Minimum length of word to be indexed
$MINLEN=3;

# Stop indexing a word that appears in over X percent of all files
$MAXPERCENT=60;

# File suffixes to index (regular expression)
$SUFFIXES='\.(rtf|[sp]?html?|txt|mail)$';

#--- end of configuration --- don't change anything below ---

# Shared indexer state.
our @allfiles;       # every file name accepted by wanted()
our %index;          # word   => concatenated "<freq>[:]<fileid>" entries, or "0" once dropped
our %files;          # fileid => path of the file
our %titles;         # fileid => title text collected between <title> and </title>
our %mtimes;         # fileid => modification time from stat()
our $fileno = 0;     # number of files indexed so far; source of the hex file ids

# Character tables filled lazily by initTables().
our (@isohtml, @iso2text, %html2iso);

# Pattern used to pull the CONTENT value out of a META tag.
# NOTE(review): the original pattern was lost when this file was mangled
# (everything between '<' and '>' had been stripped from the source). This is
# a reconstruction that indexes the CONTENT of any META tag - confirm against
# a pristine copy of ICE 1.5 if one is available.
my $META_CONTENT_RE = qr/<META\b[^>]*\bCONTENT\s*=\s*"([^"]*)"/i;

# build_index - main program.
#
# Opens $INDEXFILE for writing, collects all files below @SEARCHDIRS whose
# name matches $SUFFIXES, indexes each of them, and writes the index:
#
#   <word> <entries>          one line per word, sorted by word
#   --                        separator
#   <directory>               whenever the directory differs from the previous one
#   <fileid> <name> /<mtime> <title>
#
# Progress and diagnostics go to STDERR. Dies if the index file cannot be
# opened or closed.
sub build_index {
  # Open the index first so that a bad path fails before the (slow) scan.
  open(my $index_fh, '>', $INDEXFILE)
    or die("Cannot open $INDEXFILE: $!\n");

  File::Find::find(\&wanted, @SEARCHDIRS);

  my $count   = 0;
  my $percent = 0;
  foreach my $file (@allfiles) {
    print STDERR "indexing [$file]\n";
    my $lastpercent = $percent;
    $percent = int(100 * $count / @allfiles);
    if ($percent > $lastpercent) {
      print STDERR $percent, "% ";
    }
    indexfile($file);
    $count++;
    # every 100th file until the 1000th...
    if ((($count % 100) == 0) && ($count >= 200) && ($count < 1200)) {
      # remove the most frequent words so far from the index
      removefrequent();
    }
  }
  removefrequent();

  # print sorted list of words and their fileids
  foreach my $word (sort keys(%index)) {
    print {$index_fh} "$word ", $index{$word}, "\n";
  }
  print {$index_fh} "--\n";

  # print list of all files and their fileid; a directory line is emitted
  # only when the directory differs from that of the previous entry
  my $dir = '';
  foreach my $id (sort keys(%files)) {
    if ($files{$id} =~ m:(.*)/([^/]*)$:) {
      my $prevdir = $dir;
      my $name    = $2;
      $dir = $1;
      if ($prevdir ne $dir) {
        print {$index_fh} "$dir\n";
      }
      my $title = $titles{$id};
      my $mtime = $mtimes{$id};
      print {$index_fh} "$id $name /$mtime $title\n";
    }
  }

  # Buffered write errors (disk full, ...) only surface at close time.
  close($index_fh) or die("Cannot close $INDEXFILE: $!\n");
  return;
}

# wanted - File::Find callback.
# Remembers the full path of every entry whose name matches $SUFFIXES
# (case-insensitively) in @allfiles.
sub wanted {
  if ($File::Find::name =~ /$SUFFIXES/i) {
    push(@allfiles, $File::Find::name);
  }
  return;
}

# removefrequent - drop words that occur in too many files.
# Modifies %index: every word found in more than $MAXPERCENT percent of the
# files indexed so far has its entry replaced by "0", which also stops
# indexfile() from adding further entries for that word.
sub removefrequent {
  my $numfiles = keys(%files);
  foreach my $word (keys(%index)) {
    # every file id ends in a blank, so the blanks count the files
    my $num = ($index{$word} =~ tr/ //);
    # don't index words in more than X % of the files
    if ($num * 100 > $MAXPERCENT * $numfiles) {
      print STDERR "removing common word: $word [$num of $numfiles]\n";
      $index{$word} = "0";
    }
  }
  return;
}

# indexfile - index a single file.
#
# Parameter: $file - path of the file to index.
#
# Assigns the next hexadecimal file id, records path, title and mtime in
# %files, %titles and %mtimes, and appends "<freq>[:]<fileid>" to the %index
# entry of every word found (the ":" is inserted when the frequency has more
# than one digit). Directories are skipped silently; unreadable files are
# reported on STDERR and skipped.
sub indexfile {
  my ($file) = @_;

  # PJ - no directories
  return if -d $file;

  my $in;
  unless (-r $file && open($in, '<', $file)) {    # file readable?
    print STDERR "cannot read file [$file]\n";
    return;
  }

  $fileno++;
  my $fileid = sprintf("%X ", $fileno);    # note the trailing blank
  $files{$fileid} = $file;

  my $mtime = (stat($file))[9];
  $mtime = '' unless defined $mtime;

  # strip html tags?
  my $ishtml = ($file =~ m!\.([sp]?html?)$!i) ? 1 : 0;

  my $title   = '';
  my $intitle = '';
  my %freqlist;    # word => number of occurrences in this file

  {
    # read tag-wise: the input separator is the tag close character ">"
    local $/ = ">";
    while (my $record = <$in>) {
      $record =~ s/&nbsp\;/ /ig;
      $record =~ s/\s+/ /g;              # fold whitespaces into a single blank
      $record =~ s/([^\n])</$1\n</g;     # insert newlines before every '<'
      $record =~ s/>([^\n])/>\n$1/g;     # .. and after every '>'

      foreach my $chunk (split(/\n/, $record)) {
        # opening title tag
        if ($chunk =~ m:<title>:i) {
          $intitle = "y";
          $title   = "";
        }
        # closing title tag
        if ($chunk =~ m:</title>:i) {
          $intitle = "";
        }

        # strip spurious tag delimiters
        $chunk =~ s![<>]! !g if (!($ishtml));

        # outside a tag or inside META tag => index word
        ## PJ: BUG - we also want to index non-html
        ## so do some guessing to enable this
        ## (above: try first lines to extract title from ascii)
        ## (prefer subject: if exists)
        if ($chunk =~ /</) {
          next unless $chunk =~ $META_CONTENT_RE;
          $chunk = $1;
        }

        if ($ISO eq "y" && $chunk =~ /[&\xc0-\xff]/) {
          # convert html special chars and iso 8bit to text
          $chunk = html2text($chunk);
        }

        # if inside title
        if ($intitle) {
          $chunk =~ tr/a-zA-Z\xc0-\xff0-9\-/ /cs;
          $title .= $chunk;
          next;
        }

        # the following line defines what you consider a "printable" char
        $chunk =~ tr/a-zA-Z\xc0-\xff/ /cs;
        foreach my $word (split(/ /, $chunk)) {
          next unless (length($word) >= $MINLEN);    # if too short skip
          if ($word =~ /^[A-Z][^A-Z]*$/) {           # "Someword" to "someword"
            $word =~ tr/A-Z/a-z/;
          }
          $freqlist{$word}++;
          if ($word =~ /[A-Z]/) {                    # store abbr. as all-lower, too
            $word =~ tr/A-Z/a-z/;
            $freqlist{$word}++;
          }
        }
      }
    }
  }
  close($in);

  # convert MAC and PC path separators to UNIX style slashes
  # NOTE(review): as in the original, this normalised path is computed but
  # never written to the index - %files was filled with the raw path above,
  # so $TYPE currently has no effect on the output. Storing $unix_path in
  # %files looks like the intent, but it would change the paths seen by the
  # search script; confirm before enabling.
  my $unix_path = $file;
  $unix_path =~ tr/\n/ /s;
  if ($TYPE eq "MAC") { $unix_path =~ s|:|/|g; }
  if ($TYPE eq "PC")  { $unix_path =~ s|\\|/|g; }
  # on a MAC, add the leading slash
  if ($unix_path =~ m/^[^\/]/) { $unix_path = "/$unix_path"; }

  $title =~ tr/\n/ /s;

  foreach my $word (keys(%freqlist)) {
    # "0" marks a word that removefrequent() has dropped for good
    next if defined($index{$word}) && $index{$word} eq "0";
    my $freq = $freqlist{$word};
    $freq .= ":" unless length($freq) == 1;
    $index{$word} .= $freq . $fileid;
  }
  $titles{$fileid} = $title;
  $mtimes{$fileid} = $mtime;
  return;
}

# iso2html - translate iso 8 bit characters to HTML
#
# Thanks to
# Pierre Cormier (cormier.pierre@uqam.ca)
# Universite du Quebec Montreal

# initTables - fill @isohtml, @iso2text and %html2iso.
#   @isohtml  : character code => HTML entity (codes 0..191 map to themselves)
#   @iso2text : character code => plain ASCII replacement text
#   %html2iso : HTML entity (or single character) => ISO 8859-1 character
sub initTables {
  # entity names for the codes 0xc0 .. 0xff, in code order
  my @entity_names = qw(
    Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
    Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
    ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
    Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
    agrave aacute acirc  atilde auml   aring  aelig  ccedil
    egrave eacute ecirc  euml   igrave iacute icirc  iuml
    eth    ntilde ograve oacute ocirc  otilde ouml   divide
    oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
  );

  foreach my $code (0 .. 191) {
    $isohtml[$code] = pack("C", $code);
  }
  foreach my $offset (0 .. $#entity_names) {
    $isohtml[hex('c0') + $offset] = '&' . $entity_names[$offset] . ';';
  }

  # preset iso2text variable settings: the default replacement is the
  # first letter of the entity name ('&Agrave;' => 'A')
  foreach my $code (0 .. 191) {
    $iso2text[$code] = pack("C", $code);
  }
  foreach my $code (hex('c0') .. hex('ff')) {
    $iso2text[$code] = substr($isohtml[$code], 1, 1);
  }
  # now assign exceptions:
  $iso2text[hex('c4')] = 'Ae';
  $iso2text[hex('c6')] = 'AE';
  $iso2text[hex('d0')] = 'ETH';    # ???
  $iso2text[hex('d6')] = 'Oe';
  $iso2text[hex('d7')] = 'x';
  $iso2text[hex('dc')] = 'Ue';
  $iso2text[hex('de')] = 'Th';     # thorn ???
  $iso2text[hex('df')] = 'sz';
  $iso2text[hex('e4')] = 'ae';
  $iso2text[hex('e6')] = 'ae';
  $iso2text[hex('f6')] = 'oe';
  $iso2text[hex('f7')] = 'D';      # Divis?
  $iso2text[hex('fc')] = 'ue';
  $iso2text[hex('fe')] = 'th';     # thorn

  # set html2iso variable
  foreach my $code (1 .. 255) {
    $html2iso{$isohtml[$code]} = pack("C", $code);
  }
  # earlier versions of this table spelled the division sign '&DIVIS;';
  # keep accepting that spelling on input
  $html2iso{'&DIVIS;'} = pack("C", hex('f7'));
  return;
}

# iso2html - replace every ISO 8859-1 character from 0xc0 to 0xff in the
# argument by its HTML entity; other characters are passed through.
# Returns the converted string.
sub iso2html {
  my ($input) = @_;
  initTables() unless defined($isohtml[0]);
  return '' unless defined($input);
  return join('', map {
    my $code = ord($_);
    defined($isohtml[$code]) ? $isohtml[$code] : $_;
  } split(//, $input));
}

# iso2text - replace every ISO 8859-1 character from 0xc0 to 0xff in the
# argument by a plain ASCII transliteration (e.g. 0xfc => 'ue'); other
# characters are passed through. Returns the converted string.
sub iso2text {
  my ($input) = @_;
  initTables() unless defined($isohtml[0]);
  return '' unless defined($input);
  return join('', map {
    my $code = ord($_);
    defined($iso2text[$code]) ? $iso2text[$code] : $_;
  } split(//, $input));
}

# _decode_entity - helper for html2iso.
# Takes the text between '&' and ';' of a character reference and returns
# the ISO 8859-1 character it stands for. References that are unknown or
# outside the range 1..255 yield the empty string.
sub _decode_entity {
  my ($ref) = @_;
  if ($ref =~ /^#[xX]([0-9A-Fa-f]+)$/) {
    my $code = hex($1);
    return ($code >= 1 && $code <= 255) ? chr($code) : '';
  }
  if ($ref =~ /^#(\d+)$/) {
    my $code = $1;
    return ($code >= 1 && $code <= 255) ? chr($code) : '';
  }
  my $entity = "&$ref;";
  return exists($html2iso{$entity}) ? $html2iso{$entity} : '';
}

# html2iso - replace HTML character references in the argument by the
# corresponding ISO 8859-1 characters.
#
# Named entities from the table and numeric references (&#233; / &#xE9;) in
# the range 1..255 are decoded. Unknown references such as '&amp;' are
# removed, as they always were. Everything else - in particular a bare '&'
# and any ';' that is not part of a reference - is left untouched.
# Returns the converted string.
sub html2iso {
  my ($input) = @_;
  initTables() unless defined($isohtml[0]);
  return '' unless defined($input);
  $input =~ s/&(#[xX][0-9A-Fa-f]{1,6}|#\d{1,7}|[A-Za-z][A-Za-z0-9]*);/_decode_entity($1)/ge;
  return $input;
}

# html2text - convert HTML character references and ISO 8859-1 8 bit
# characters in the argument to plain ASCII text.
sub html2text {
  return iso2text(html2iso(@_));
}

# Run the indexer when executed as a script; stay quiet when the file is
# loaded with require/do so that the helper functions can be reused.
build_index() unless caller;

1;