#!/usr/local/bin/perl
#
# ice-form.pl -- cgi compliant ICE search interface // Jun 24 1996
#
# ICE Version 1.31
# (C) Christian Neuss (http://www.informatik.th-darmstadt.de/~neuss)
#--- start of configuration --- put your changes here ---
# Title or name of your server:
# Example: local($title)="ICE Indexing Gateway";
local($title)="ICE Indexing Gateway";
# search directories to present in the search dialog
# Example:
# local(@directories)=(
# "Image Communication Information Board (/icib)",
# "WISE (/some/where/wise)"
# );
# local(@directories)=(
# "Image Communication Information Board (/icib)",
# "WISE (/www/projects/wise)",
# "Multimedia Survey (/www/projects/mms)",
# "Department A2 (/www/igd-a2)",
# "Department A8 (/www/igd-a8)",
# "Department A9 (/www/igd-a9)",
# "DZSIM (/www/projects/dzsim)",
# "CSCW Laboratory (/www/projects/cscw-lab)",
# "Software Catalog (/www/projects/sw-catalog)",
# "WWW-Schulung (/www/igd-a3/schulung)",
# "DZSIM (/www/projects/dzsim)",
# "ZGDV User Interface GROUP (/www/zgdv-uig)"
# );
# Location of the indexfile:
# Note: under Windows or Windows NT, add the drive letter
# Example: $indexfile='/usr/local/etc/httpd/index/index.idx';
$indexfile='/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs/cgi/ice/ice1-31/index.idx';
# Location of the thesaurus data file:
# Example: $thesfile='/igd/a3/home1/neuss/Perl/thes.dat';
# $thesfile='/igd/a3/home1/neuss/Perl/thes.dat';
# URL Mappings (a.k.a Aliases) that your server does
# map "/" to some path to reflect a "document root"
# Example
# %urltopath = (
# '/projects', '/usr/stud/proj',
# '/people', '/usr3/webstuff/staff',
# '/', '/usr3/webstuff/documents',
# );
#
%urltopath = (
'/www/htdocs/domains/domain3/00095/www.wildboar.net/webdocs',
);
#--- end of configuration --- you don't have to change anything below ---
# if this script is called up "by hand", run a test
unless($ENV{"SCRIPT_NAME"}){
local($word) = ($#ARGV==-1) ? "the" : $ARGV[0] ;
print "You have called the ice forms interface manually.\n";
print "Optionally, provide search word as an argument.\n";
print "Test mode: search for \"$word\"\n";
print "--------\n";
$orig="$word @ /";
$foo=&getquery($orig);
print $foo;
exit;
}
# do the real work, but trap any errors
eval '&main';
# if an error has occured, log it to stdout
if($@){
&send_header("Error in Script"); # just in case
print "$@\n
';
print "$title
";
print "
This searchable archive was implemented with the
ICE search engine
';
print "\n$title
\n";
# parse forms result and store it in an associative array
%forms=&cgiparse();
$pquery = $query = $forms{KEYWORDS};
$context = $forms{CONTEXT};
$context = $forms{CONTEXT};
if($context =~ m:\(([^)]*)\):) {
$context=$1;
}else{
$context="";
}
$thesaurus = $forms{THESAURUS};
$substring = $forms{SUBSTRING};
$days = $forms{DAYS};
if(length($days)>0){
$pquery.=" -D $days";
}
if(length($thesaurus)>0){
$pquery.=" -T";
}
if(length($substring)>0){
$pquery.=" -S";
}
if(length($context)>0){
$pquery.=" @ $context";
}
($err,$page) = &getquery($pquery);
if($err){
print "Query was: $query
\n";
print "Problem: $err\n";
print "";
}
# print the CGI script header
sub send_header {
local($title)=@_;
print "Content-type: text/html\n\n";
print "
";
print $title;
print "\n";
}
# display the Forms interface
sub send_index {
local($scriptname) = $ENV{"SCRIPT_NAME"};
print '
END
}
# parse data from CGI request and store it as name/value pairs
sub cgiparse {
if ($ENV{'REQUEST_METHOD'} eq "POST") {
read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'});
} else {
$buffer = $ENV{'QUERY_STRING'};
}
local(@query_strings) = split("&", $buffer);
foreach $q (@query_strings) {
$q =~ s/\+/ /g;
($attr, $val) = split("=", $q);
$val =~ s/%/\n%/g;
local($tmpval);
foreach (split("\n",$val)){
if(m:%(\w\w):){
local($binval) = hex($1);
if(($binval>0)&&($binval<256)){
local($htmlval) = pack("C",$binval);
s/%$1/$htmlval/;
}
}
$tmpval .= $_;
}
$forms{$attr} = $tmpval;
}
%forms;
}
sub main {
# if content_length is zero and query string is empty
if (($ENV{CONTENT_LENGTH}==0) &&
(length($ENV{"QUERY_STRING"})==0)){
# we're not decoding a form yet => send the form
&send_header("$title");
&send_index();
return;
}
&send_header("$title: ICE Query Result");
print '
";
return undef;
}
print "Preferences set for this query:\n";
print "
\n";
if ($query) {
print "- query was \"$query\"\n";
}
if($context){
print "
- context was set to $context.\n";
}
if($thesaurus){
print "
- use of thesaurus turned on\n";
}
if($substring){
print "
- substring matching turned on\n";
}
if($days){
print "
- changed in the last $days days\n";
}
print "
\n";
if($page){
print "
The index contains the following\n";
print "items relevant to the query\n";
print "$page\n";
}else{
print "
Nothing found.\n";
}
print "\n";
}
########### begin of ice kernel functions, formerly ice.pl ##########
sub getquery{
local($query)=@_;
local($page)="
\n";
local($hitcount);
$query = &iso2html($query);
local($err,$context,$thes,$substr,$bool,$days,@ret)=&parsequery($query);
if($err){
return $err; #XXX hm..
}
local(@tmplist)= &getindex($thes,$bool,$days,$substr,@querystring);
if($context){
# translate virtual<->physical path
local($v,$p)=($context,&translate($context));
@tmplist=grep(/^[^\n]*\n$p/,@tmplist);
###print "translate context from \n$v to \n$p\n";
}
for $w (@tmplist){
local($freq,$file,$title,@hits)=split(/\n/,$w);
local($virt)=&translateback($file);
###print "translate docpath from \n$file to \n$virt\n";
$w=join("\n",$freq,$virt,$title,@hits);
}
sub downbynumber {$b <=> $a;}
@tmplist = sort downbynumber @tmplist;
foreach $w (@tmplist){
local($freq,$file,$title,@hits)=split(/\n/,$w);
$hitcount++;
###print "$freq,$title\n";
unless($title) { $title="(NO TITLE)"; }
$page .= "- $title
\n";
$page .= "$file
\n";
foreach $line (@hits){
$page .= "$line
\n";
}
}
if($hitcount>0){
$page .= "
";
}else{
$page = "";
}
("",$page); # return;
}
# parse query
sub parsequery{
local($query)=@_;
local($context,$thesaurus,$substr);
# preprocess whitespace and discard spaces after @ and -D
$query =~ tr/ \t/ /s;
$query =~ s/@ /@/g;
$query =~ s/-D /-D/g;
$query =~ s/^-D/ -D/g;
$_=$query;
# "optional URL context as @-sign"
if(m:^([^@]*)\s+@(.*)$:){
$context=$2;
$_=$1;
} ### to be added: "IN"
while(m:\s+-[SDT]\d*$:){
# "turn on "global" thesaurus" by adding -T"
if(m:^(.*)\s+-T$:){
$thesaurus="y";
# print "turn on thesaurus\n";
$_=$1;
}
# "turn on matching substrings by adding -S"
if(m:^(.*)\s+-S$:){
$substr="y";
# print "turn on substring matching\n";
$_=$1;
}
# "turn on modified since n days" by adding -D"
if(m:^(.*)\s+-D(\d+)$:){
$days=$2;
# print "turn on modified since $days\n";
$_=$1;
}
}
# print "remaining query $_\n";
@list=split(/ /,$_);
$expectword="y" unless($days && $#list==-1);
foreach $w (@list){
$_ = $w;
tr/A-Z/a-z/;
if(/^and$/) {
if($expectword) {$err="$w"; last;}
$expectword="y";
$bool .= "&";
}elsif(/^or$/){
if($expectword) {$err="$w"; last;}
$expectword="y";
$bool .= "+";
}else{
### unless($expectword) {$err="$w"; last;}
# new: and is optional
unless($expectword) {
$bool .= "&";
}
$expectword="";
push(@querystring,$w);
}
}
if($expectword){
return ("syntax error in query: must end with keyword!");
}
if($err){
return ("syntax error in query near '$err'!");
}
#print"c=$context\nt=$thesaurus\nl=$levenshtein\n";
#print "b=$bool\nd=$days\nq=@querystring\n";
return("",$context,$thesaurus,$substr,$bool,$days,@querystring);
}
# get index entries matching query
sub getindex{
local($thes,$bool,$days,$substr,@query)=@_;
local(@list,$count,$item,$w,@wordnum,$grepexpr,$ret);
local($limit);
if($days){
$limit=time()-(60*60*24*$days) unless($days==0);
}
foreach $item (@query){
++$count;
local($w);
$_=$item;
local($flag)=$thes;
if (/{(.*)}/) {
$_ = $1;
$flag="y";
}
# convert e.g. "Picture" to "picture"
if(/^[A-Z][^A-Z]*$/){
tr/A-Z/a-z/;
}
# evaluate thesaurus
if ($flag) {
push (@keywords,$_);
$wordnum{$_}=$count;
local(@synonyms)=split(/\n/,&thesread($thesfile,$_));
foreach $w (@synonyms){
push (@keywords,$w);
$wordnum{$w}=$count;
}
} else {
$w=$_;
push (@keywords,$w);
$wordnum{$w}=$count;
}
}
$grepexpr="^@";
foreach (@keywords) { $grepexpr.="|$_"; }
local($timstr,$pat);
open(fpin,"<$indexfile") || die "$!";
while(){
next unless (/$grepexpr/o);
if(/^@/o){
# thanks to John Harper!
if(/\@f\s(.*)$/o) { $path =$1; next; }
if(/\@t\s(.*)$/o) { $title =$1; next; }
if(/\@m\s(.*)$/o) {
$mtime=$1;
$timstr = &timetostr($mtime);
$title = "$title (last change $timstr)";
if(($#keywords==-1) && ($limit==0 || $mtime>=$limit)){
$entry=join("\n",$path,"","","",$title);
push(@list,$entry);
}
next;
}
} else {
if($limit==0 || $mtime>=$limit){
foreach $w (@keywords){
$pat = $substr ? ".*$w.*" : $w;
if(/(\d+)\s+($pat)$/){
$freq=$1;
$word=$2;
if(length($word)>0){
$token=$wordnum{$w};
$entry=join("\n",$path,$token,$word,$freq,$title);
push(@list,$entry);
}
}
}
}
}
}
### print "list has $#list elements\n";
if($#keywords>=0){
# if keywords given evaluate expression and stuff
@list=sort(@list);
&evaluateexpr($bool,@list); #return
}else{
# else just reorder
foreach $w (@list){
($path,$token,$word,$freq,$title)=split(/\n/,$w);
$w=join("\n","1",$path,$title);
}
@list;
}
}
sub evaluateexpr {
local($bool,@list)=@_;
local($lastpath,$lasttitle,$retval);
local($relevance,%wordlist);
local(@reslist);
# let's do some initialization
($path,$token,$word,$freq,$title)=split(/\n/,$list[0]);
$lastpath=$path;
$lasttitle=$title;
%wordlist=();
local($fw);
if($word ne $query[$token-1]){
$fw="$freq ($query[$token-1])";
}else{
$fw="$freq";
}
$wordlist{$word}=$fw;
foreach $i (0 .. $count-1){
$exprarray[$i]=0;
}
# loop over all entries in list
foreach $i (0 .. $#list+1){ # sic!
if($i <= $#list){
($path,$token,$word,$freq,$title)=split(/\n/,$list[$i]);
}
# if path has changed -> compute expression
if(($lastpath ne $path) || ($i==$#list+1)) {
$expr=join('',@exprarray);
###print "path changed, call $exprarray ($expr) and ($bool)\n";
local($ret)=&booleval($bool,$expr);
if($ret==1){
local($w);
$retval .= "$relevance\n"; # here's a hit
$retval .= "$lastpath\n"; # here's a hit
$retval .= "$lasttitle\n"; # here's a hit
foreach $w (sort keys(%wordlist)){
$retval .= "\"$w\" $wordlist{$w}\n";
}
push(@reslist,$retval);
$retval="";
}
if($i==$#list+1){
last; # leave loop
}
$lastpath=$path;
$lasttitle=$title;
$relevance=$freq;
%wordlist=();
if($word ne $query[$token-1]){
$fw="$freq ($query[$token-1])";
}else{
$fw="$freq";
}
$wordlist{$word}=$fw;
foreach $i (0 .. $count-1){
$exprarray[$i]=0;
}
$exprarray[$token-1]=1;
}else{
$exprarray[$token-1]=1;
$relevance += $freq;
if($word ne $query[$token-1]){
$fw="$freq ($query[$token-1])";
}else{
$fw="$freq";
}
$wordlist{$word}=$fw;
}
}
@reslist; # return;
}
# compute boolean expressions
# e.g. to compute "1 or 0 and 1" use booleval("101","+&")
sub booleval {
local($arg1,$arg2)=@_;
local($t1,$t2,$i,$op,$opers,$terms);
# print "bool($arg1,$arg2)\n"; #XXX
@opers=split(//,$arg1);
@terms=split(//,$arg2);
$t1=$terms[0];
if($#terms==0){ # only one term given
return $t1;
}
for $i (0..($#terms-1)){
$t2=$terms[$i+1];
$op=$opers[$i];
if($op eq "+"){
if($t1!=0){
return 1;
} else {
$t1=$t2;
}
}else{
$t1*=$t2;
}
}
return $t1;
}
# evaulate a thesaurus file for a given term
sub thesread {
local($thesfile,$word)=@_;
local($last,$result,$line)="";
local($allowed)="EQ|AB|UF";
unless (open(fpin,$thesfile)) {
print STDERR "Cannot open thesaurus file $thesfile\n";
return undef;
}
while(){
$line++;
if (m:^(\S+)\s+$:) {
$last=$1;
}elsif((m:^\s+($allowed)\s+(\S+):)&&($last eq $word)) {
$result .= "$2\n";
}
}
close(fpin);
$result;
}
# translate URL to physical
sub translate {
local($url)=@_;
local($docroot,$aliasdone);
$_=$url;
s|/+$||; # strip off a trailing "/"
#print "Was $_\n";
foreach $key (keys(%urltopath)){
if($key eq "/"){
$docroot=$urltopath{$key};
}
if( ($key ne "/") && (/^$key/) ){
s/^$key/$urltopath{$key}/;
$aliasdone="y";
#print "replacing $key with $urltopath{$key}\n";
}
}
if(!$aliasdone && $docroot){
$_ = $docroot.$_;
}
#print "Now is $_\n";
$_;
}
# translate physical to URL
sub translateback {
local($url)=@_;
local($docroot,$aliasdone);
$_=$url;
s/(.*)\/$/\1/; # strip off a trailing "/"
#print "Was $_\n";
foreach $key (keys(%urltopath)){
if($key eq "/"){
$docroot=$urltopath{$key};
}else{
if(/^$urltopath{$key}/){
s/^$urltopath{$key}/$key/;
$aliasdone="y";
#print "replacing $urltopath{$key} with $key\n";
}
}
}
if(!$aliasdone && $docroot){
s/$docroot//;
}
#print "Now is $_\n";
$_;
}
# convert time to string
sub timetostr{
local($time)=@_;
local($sec,$min,$hour,$mday,$mon,$year,$wday,@dontcare)=localtime($time);
local($weekday)=(Sun,Mon,Tue,Wed,Thu,Fri,Sat)[$wday];
local($month)=(Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec)[$mon];
local($result)="$weekday $mday $month $year";
$result;
}
# iso2html - translate iso 8 bit characters to HTML
#
# Thanks to
# Pierre Cormier (cormier.pierre@uqam.ca)
# Universite du Quebec Montreal
sub iso2html {
local($input)=@_;
unless($isohtml[0]){
foreach (0..191) { $isohtml[$_] = pack("C",$_);}
$isohtml[hex('c0')] = 'À';
$isohtml[hex('c1')] = 'Á';
$isohtml[hex('c2')] = 'Â';
$isohtml[hex('c3')] = 'Ã';
$isohtml[hex('c4')] = 'Ä';
$isohtml[hex('c5')] = 'Å';
$isohtml[hex('c6')] = 'Æ';
$isohtml[hex('c7')] = 'Ç';
$isohtml[hex('c8')] = 'È';
$isohtml[hex('c9')] = 'É';
$isohtml[hex('ca')] = 'Ê';
$isohtml[hex('cb')] = 'Ë';
$isohtml[hex('cc')] = 'Ì';
$isohtml[hex('cd')] = 'Í';
$isohtml[hex('ce')] = 'Î';
$isohtml[hex('cf')] = 'Ï';
$isohtml[hex('d0')] = 'Ð';
$isohtml[hex('d1')] = 'Ñ';
$isohtml[hex('d2')] = 'Ò';
$isohtml[hex('d3')] = 'Ó';
$isohtml[hex('d4')] = 'Ô';
$isohtml[hex('d5')] = 'Õ';
$isohtml[hex('d6')] = 'Ö';
$isohtml[hex('d7')] = '×';
$isohtml[hex('d8')] = '&Ostroke;';
$isohtml[hex('d9')] = 'Ù';
$isohtml[hex('da')] = 'Ú';
$isohtml[hex('db')] = 'Û';
$isohtml[hex('dc')] = 'Ü';
$isohtml[hex('dd')] = 'Ý';
$isohtml[hex('de')] = 'Þ';
$isohtml[hex('df')] = 'ß';
$isohtml[hex('e0')] = 'à';
$isohtml[hex('e1')] = 'á';
$isohtml[hex('e2')] = 'â';
$isohtml[hex('e3')] = 'ã';
$isohtml[hex('e4')] = 'ä';
$isohtml[hex('e5')] = 'å';
$isohtml[hex('e6')] = 'æ';
$isohtml[hex('e7')] = 'ç';
$isohtml[hex('e8')] = 'è';
$isohtml[hex('e9')] = 'é';
$isohtml[hex('ea')] = 'ê';
$isohtml[hex('eb')] = 'ë';
$isohtml[hex('ec')] = 'ì';
$isohtml[hex('ed')] = 'í';
$isohtml[hex('ee')] = 'î';
$isohtml[hex('ef')] = 'ï';
$isohtml[hex('f0')] = 'ð';
$isohtml[hex('f1')] = 'ñ';
$isohtml[hex('f2')] = 'ò';
$isohtml[hex('f3')] = 'ó';
$isohtml[hex('f4')] = 'ô';
$isohtml[hex('f5')] = 'õ';
$isohtml[hex('f6')] = 'ö';
$isohtml[hex('f7')] = '&DIVIS;';
$isohtml[hex('f8')] = '&ostroke;';
$isohtml[hex('f9')] = 'ù';
$isohtml[hex('fa')] = 'ú';
$isohtml[hex('fb')] = 'û';
$isohtml[hex('fc')] = 'ü';
$isohtml[hex('fd')] = 'ý';
$isohtml[hex('fe')] = 'þ';
$isohtml[hex('ff')] = 'ÿ';
}
local(@car) = split(//,$input);
local($output);
foreach (@car) {
$output .= $isohtml[ord($_)];
}
$output;
}