DESCRIPTION
The following sample code for invindexer.plx and search.cgi can be used to create a simple search engine. It requires the HTML presentation of the US Constitution included in the KinoSearch distribution under "t/us_constitution". Note that a proper indexer for HTML documents would not rely on quick-n-dirty regular expressions for stripping tags, as this one does for the sake of brevity; it would use a dedicated parsing module such as HTML::Parser.
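For instance, a minimal sketch of tag stripping with HTML::Parser might look like the following. It is not part of the sample below (which keeps the regex for brevity); it simply collects the decoded text content of an HTML file named on the command line and discards the markup.

#!/usr/bin/perl
use strict;
use warnings;
use HTML::Parser;

# Slurp the HTML file named on the command line.
my $content = do { local $/; <> };

# Collect only the decoded text content, discarding all tags.
my $stripped = '';
my $parser   = HTML::Parser->new(
    api_version => 3,
    text_h      => [ sub { $stripped .= shift }, 'dtext' ],
);
$parser->parse($content);
$parser->eof;

print $stripped;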
invindexer.plx
#!/usr/bin/perl
use strict;
use warnings;

use File::Spec;
use KinoSearch::InvIndexer;
use KinoSearch::Analysis::PolyAnalyzer;

### In order for invindexer.plx to work correctly, you must modify
### $source_dir, $path_to_invindex, and possibly $base_url.
###
### $source_dir must lead to the directory containing the US
### Constitution html files.
###
### $path_to_invindex is the future location of the invindex.
###
### $base_url should reflect the location of the us_constitution directory
### when accessed via a web browser.
my $source_dir       = '';
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

opendir( my $source_dh, $source_dir )
    or die "Couldn't opendir '$source_dir': $!";
my @filenames = grep {/\.html/} readdir $source_dh;
closedir $source_dh or die "Couldn't closedir '$source_dir': $!";

### STEP 1: Choose an Analyzer.
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en', );

### STEP 2: Create an InvIndexer object.
my $invindexer = KinoSearch::InvIndexer->new(
    analyzer => $analyzer,
    invindex => $path_to_invindex,
    create   => 1,
);

### STEP 3: Define fields.
$invindexer->spec_field( name => 'title' );
$invindexer->spec_field(
    name       => 'bodytext',
    vectorized => 1,
);
$invindexer->spec_field(
    name    => 'url',
    indexed => 0,
);

foreach my $filename (@filenames) {
    next if $filename eq 'index.html';
    my $filepath = File::Spec->catfile( $source_dir, $filename );

    open( my $fh, '<', $filepath )
        or die "couldn't open file '$filepath': $!";
    my $content = do { local $/; <$fh> };

    ### STEP 4: Start a new document.
    my $doc = $invindexer->new_doc;

    $content =~ m#<title>(.*?)</title>#s
        or die "couldn't isolate title in '$filepath'";
    my $title = $1;

    $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
        or die "couldn't isolate bodytext in '$filepath'";
    my $bodytext = $1;
    $bodytext =~ s/<.*?>/ /gsm;    # quick and dirty tag stripping

    ### STEP 5: Set the value for each field.
    $doc->set_value( url      => "$base_url/$filename" );
    $doc->set_value( title    => $title );
    $doc->set_value( bodytext => $bodytext );

    ### STEP 6: Add the document to the invindex.
    $invindexer->add_doc($doc);

    ### STEP 7: Repeat steps 4-6 for each document in the collection.
}

### STEP 8: Finalize the invindex.
$invindexer->finish;
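After editing the paths and running invindexer.plx, it can be useful to confirm that the invindex was built correctly before setting up the CGI script. The following one-off command-line script is a minimal sketch, not part of the distribution; it reuses the same Searcher calls demonstrated in search.cgi below, and $path_to_invindex must again be pointed at the invindex.

#!/usr/bin/perl
use strict;
use warnings;

use KinoSearch::Searcher;
use KinoSearch::Analysis::PolyAnalyzer;

# Point this at the invindex created by invindexer.plx.
my $path_to_invindex = '';

# Use the same Analyzer that was used at index time.
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en' );
my $searcher = KinoSearch::Searcher->new(
    invindex => $path_to_invindex,
    analyzer => $analyzer,
);

# Treat the command-line arguments as the query string.
my $hits = $searcher->search("@ARGV");
$hits->seek( 0, 10 );

while ( my $hit = $hits->fetch_hit_hashref ) {
    printf( "%0.3f  %s\n", $hit->{score}, $hit->{title} );
}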
search.cgi
#!/usr/bin/perl -T
use strict;
use warnings;

use CGI;
use List::Util qw( max min );
use POSIX qw( ceil );
use KinoSearch::Searcher;
use KinoSearch::Analysis::PolyAnalyzer;
use KinoSearch::Highlight::Highlighter;

my $cgi           = CGI->new;
my $q             = $cgi->param('q');
my $offset        = $cgi->param('offset');
my $hits_per_page = 10;
$q      = '' unless defined $q;
$offset = 0  unless defined $offset;

### In order for search.cgi to work, $path_to_invindex must be modified so
### that it points to the invindex created by invindexer.plx, and
### $base_url may have to change to reflect where a web-browser should
### look for the us_constitution directory.
my $path_to_invindex = '';
my $base_url         = '/us_constitution';

### STEP 1: Specify the same Analyzer used to create the invindex.
my $analyzer = KinoSearch::Analysis::PolyAnalyzer->new( language => 'en', );

### STEP 2: Create a Searcher object.
my $searcher = KinoSearch::Searcher->new(
    invindex => $path_to_invindex,
    analyzer => $analyzer,
);

### STEP 3: Feed a query to the Searcher object.
my $hits = $searcher->search($q);

### STEP 4: Arrange for highlighted excerpts to be created.
my $highlighter
    = KinoSearch::Highlight::Highlighter->new( excerpt_field => 'bodytext' );
$hits->create_excerpts( highlighter => $highlighter );

### STEP 5: Process the search.
$hits->seek( $offset, $hits_per_page );

### STEP 6: Format the results however you like.
# create result list
my $report = '';
while ( my $hit = $hits->fetch_hit_hashref ) {
    my $score = sprintf( "%0.3f", $hit->{score} );
    $report .= qq|
        <p>
            <a href="$hit->{url}"><strong>$hit->{title}</strong></a>
            <em>$score</em>
            <br>
            $hit->{excerpt}
            <br>
            <span class="excerptURL">$hit->{url}</span>
        </p>
        |;
}

$q = CGI::escapeHTML($q);

# display info about the number of hits, paging links
my $total_hits = $hits->total_hits;
my $num_hits_info;

if ( !length $q ) {
    # no query, no display
    $num_hits_info = '';
}
elsif ( $total_hits == 0 ) {
    # alert the user that their search failed
    $num_hits_info = qq|<p>No matches for <strong>$q</strong></p>|;
}
else {
    # calculate the nums for the first and last hit to display
    my $last_result  = min( ( $offset + $hits_per_page ), $total_hits );
    my $first_result = min( ( $offset + 1 ), $last_result );

    # display the result nums, start paging info
    $num_hits_info = qq|
        <p>
            Results <strong>$first_result-$last_result</strong>
            of <strong>$total_hits</strong> for <strong>$q</strong>.
        </p>
        <p>
            Results Page:
        |;

    # calculate first and last hits pages to display / link to
    my $current_page = int( $first_result / $hits_per_page ) + 1;
    my $last_page    = ceil( $total_hits / $hits_per_page );
    my $first_page   = max( 1, ( $current_page - 9 ) );
    $last_page = min( $last_page, ( $current_page + 10 ) );

    # create a url for use in paging links
    my $href = $cgi->url( -relative => 1 ) . "?" . $cgi->query_string;
    $href .= ";offset=0" unless $href =~ /offset=/;

    # generate the "Prev" link
    if ( $current_page > 1 ) {
        my $new_offset = ( $current_page - 2 ) * $hits_per_page;
        $href =~ s/(?<=offset=)\d+/$new_offset/;
        $num_hits_info .= qq|<a href="$href">&lt;= Prev</a>\n|;
    }

    # generate paging links
    for my $page_num ( $first_page .. $last_page ) {
        if ( $page_num == $current_page ) {
            $num_hits_info .= qq|$page_num \n|;
        }
        else {
            my $new_offset = ( $page_num - 1 ) * $hits_per_page;
            $href =~ s/(?<=offset=)\d+/$new_offset/;
            $num_hits_info .= qq|<a href="$href">$page_num</a>\n|;
        }
    }

    # generate the "Next" link
    if ( $current_page != $last_page ) {
        my $new_offset = $current_page * $hits_per_page;
        $href =~ s/(?<=offset=)\d+/$new_offset/;
        $num_hits_info .= qq|<a href="$href">Next =&gt;</a>\n|;
    }

    # finish paging links
    $num_hits_info .= "</p>\n";
}

# blast it all out
print "Content-type: text/html\n\n";
print <<END_HTML;
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
    <meta http-equiv="Content-type" content="text/html;charset=ISO-8859-1">
    <link rel="stylesheet" type="text/css" href="$base_url/uscon.css">
    <title>KinoSearch: $q</title>
</head>
<body>
    <div id="navigation">
        <form id="usconSearch" action="">
            <strong>
                Search the <a href="$base_url/index.html">US Constitution</a>:
            </strong>
            <input type="text" name="q" id="q" value="$q">
            <input type="submit" value="=&gt;">
            <input type="hidden" name="offset" value="0">
        </form>
    </div><!--navigation-->
    <div id="bodytext">
        $report
        $num_hits_info
        <p style="font-size: smaller; color: #666">
            <em>
                Powered by
                <a href="http://www.rectangular.com/kinosearch/">
                    KinoSearch
                </a>
            </em>
        </p>
    </div><!--bodytext-->
</body>
</html>
END_HTML
COPYRIGHT
Copyright 2005-2009 Marvin Humphrey

LICENSE, DISCLAIMER, BUGS, etc.
See KinoSearch version 0.165.