#!/usr/local/bin/perl
#
# ice-idx.pl - create an index file // June 24, 1999
#
# ICE Version 1.31
# (C) Christian Neuss (http://www.informatik.th-darmstadt.de/~neuss)
#
# Have this script called on a regular basis via 'cron'.
# If that's not possible, re-index manually whenever changes
# to your document hierarchy have been made.
#
# Make sure that if, for security reasons, it is being executed
# with a user id other than root, this user has both read access
# to the html files and write access to the index file.
#
# Here's an example of a crontab entry (crontab syntax varies
# between different platforms):
#
#   1 20 * * * neuss /usr/httpd/ice-idx.pl >/dev/console 2>&1
#
# Index file format, one record per document:
#
#   @f <path of the file, with "/" separators and a leading "/">
#   @t <contents of the title element>
#   @m <modification time as returned by stat()>
#   <count> <word>        (one line per distinct word, sorted by word)

use strict;
use warnings;
use File::Find;

#--- start of configuration --- put your changes here ---

# NOTE: Depending on your Perl implementation, you may
# have to use different path separators in the following
# paths when you are on a Macintosh or PC system. In that
# case, a path may look like e.g. "usr:foo:bar" (Mac), or
# "\\usr\\foo\\bar" resp. '\usr\foo\bar' (PC).
#
# NOTE(review): $INDEXFILE below is a Windows-style path while $TYPE
# is "UNIX" - confirm that this combination is intended.

# The physical directory/directories to scan for html files.
# It's better to supply a trailing "/" for each directory,
# since otherwise automounting may not work.
# Example:
#   @SEARCHDIRS=('/usr/www/dir','/tmp/html','/usr/foo/html-dir');
our @SEARCHDIRS = (
#   '/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs',
#   'E:\\wildboar',
    '',
);

# Location of the index file.
# Example:
#   $INDEXFILE='/usr/local/httpd/index.idx';
our $INDEXFILE = 'E:\wildboar\cgi\ice\ice1-31\index.idx';
# $INDEXFILE='/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs/cgi/ice/ice1-31/index.idx';

# The ICE indexer will support full international characters by
# converting them to their html equivalent, if $ISO is set.
# This has a slightly negative impact on the indexing speed, so
# set it to "y" only if you index files with 8-bit international
# characters. OTHERWISE DON'T! iso2html seems to cause a memory
# leak, causing the indexer to run forever. I'm working on it.
our $ISO = "y";

# Type of system (for figuring out the path delimiting character)
# that ice-idx.pl runs on. Select one of "UNIX", "MAC", or "PC"
our $TYPE = "UNIX";

# Minimum length of word to be indexed
our $MINLEN = 3;

#--- end of configuration --- don't change anything below ---

# Files collected by wanted() during the directory scan.
my @allfiles;

# Lookup table used by iso2html(): character code => replacement text.
# Filled on the first call.
our @isohtml;

# wanted - File::Find callback.
#
# File::Find sets $_ to the name of the current entry and
# $File::Find::name to its full path. Remembers every plain file
# whose name ends in ".html" or ".htm".
sub wanted {
    return unless /\.html?$/;   # file name ends .html or .htm
    return unless -f $_;        # skip directories that happen to match
    push(@allfiles, $File::Find::name);
    return;
}

# indexfile - write the index record for one HTML file to INDEX.
#
# Parameter: path of the file to index.
#
# The file is read in chunks that end at ">", each chunk is split into
# tags and text, and the words of the text parts are counted. If the
# file cannot be read, a message is printed on STDERR and nothing is
# written to the index.
sub indexfile {
    my ($file) = @_;

    my $fpin;
    unless (-r $file && open($fpin, '<', $file)) {   # file readable?
        print STDERR "cannot read file [$file]\n";
        return;
    }
    my $mtime = (stat($fpin))[9];

    my $title   = '';
    my $intitle = '';
    my %freqlist;
    my $convert = defined($ISO) && lc($ISO) eq 'y';

    {
        # set input separator to the tag close character ">";
        # localized so that other readers are not affected
        local $/ = ">";

        while (my $chunk = <$fpin>) {
            $chunk =~ s/\s+/ /g;              # fold whitespaces into a single blank
            $chunk =~ s/([^\n])</$1\n</g;     # insert newline before every '<'
            $chunk =~ s/>([^\n])/>\n$1/g;     # .. and after every '>'

            foreach my $piece (split(/\n/, $chunk)) {
                # opening title tag
                if ($piece =~ m:<title\b:i) {
                    $intitle = "y";
                    $title   = "";
                }
                # closing title tag
                if ($piece =~ m:</title\b:i) {
                    $intitle = "";
                }

                # inside a tag => nothing to index
                next if $piece =~ /</;

                # outside a tag => collect title text and index words
                $title .= $piece if $intitle;
                $piece = iso2html($piece) if $convert;

                # NOTE(review): this tokenising step was missing from the
                # file and has been restored from the surrounding comments
                # and code (words are runs of word characters and
                # "&name;" entities; title words are indexed as well).
                # Confirm against the ICE 1.31 distribution.
                foreach my $word (split(/[^\w&;]+/, $piece)) {
                    next unless (length($word) >= $MINLEN);   # if too short skip

                    if ($word =~ /\;$/) {
                        # get rid of trailing ";" that aren't part of &Xuml;
                        $word =~ s/((\w|\&[a-z,A-Z]+\;)+)\;?/$1/;
                    }

                    my $lower = $word;
                    $lower =~ tr/A-Z/a-z/;

                    if ($word =~ /^&?[A-Z][^A-Z]*$/) {
                        # "Someword" to "someword"
                        $freqlist{$lower}++;
                    }
                    else {
                        # unusual case - index both variants; count the
                        # lower-case form only if it really is a second
                        # variant, so plain lower-case words are not
                        # counted twice
                        $freqlist{$word}++;
                        $freqlist{$lower}++ if $lower ne $word;
                    }
                }
            }
        }
    }
    close($fpin);

    $file =~ tr/\n/ /s;

    # convert MAC and PC path separators to UNIX style slashes
    if ($TYPE eq "MAC") { $file =~ s|:|/|g; }
    if ($TYPE eq "PC")  { $file =~ s|\\|/|g; }

    # on a MAC add the leading slash
    if ($file =~ m/^[^\/]/) { $file = "/$file"; }

    $title =~ tr/\n/ /s;

    print INDEX "\@f $file\n";
    print INDEX "\@t $title\n";
    print INDEX "\@m $mtime\n";
    foreach my $w (sort keys(%freqlist)) {
        print INDEX "$freqlist{$w} $w\n";
    }
    return;
}

# iso2html - translate iso 8-bit characters to HTML
#
# Parameter: a string of 8-bit characters.
# Returns:   the string with every character in the range 0xC0-0xFF
#            replaced by its HTML entity; characters below 0xC0 are
#            passed through unchanged.
#
# Thanks to
# Pierre Cormier (cormier.pierre@uqam.ca)
# Universite du Quebec Montreal
sub iso2html {
    my ($input) = @_;
    return '' unless defined($input);

    unless (@isohtml) {
        # 0x00-0xBF map to themselves
        @isohtml = map { pack("C", $_) } 0 .. 191;
        # 0xC0-0xFF in code order
        push(@isohtml, map { "&$_;" } qw(
            Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
            Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
            ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml   times
            Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig
            agrave aacute acirc  atilde auml   aring  aelig  ccedil
            egrave eacute ecirc  euml   igrave iacute icirc  iuml
            eth    ntilde ograve oacute ocirc  otilde ouml   divide
            oslash ugrave uacute ucirc  uuml   yacute thorn  yuml
        ));
    }

    my $output = '';
    foreach my $char (split(//, $input)) {
        my $entity = $isohtml[ord($char)];
        # characters outside the table are kept as they are
        $output .= defined($entity) ? $entity : $char;
    }
    return $output;
}

#--- main ---

open(INDEX, '>', $INDEXFILE) || die("Cannot open $INDEXFILE: $!\n");

find(\&wanted, @SEARCHDIRS);

foreach my $name (@allfiles) {
    ### print STDERR "indexing [$name]\n";
    indexfile($name);
}
### system("ps -vx | egrep 'perl|MEM'");

# buffered write errors only show up when the handle is closed
close(INDEX) || die("Cannot close $INDEXFILE: $!\n");

1;