3 # DBC-OPENSEARCH gateway
4 # Z39.50 server (and SRU and...) and a client to DBC's opensearch
6 # Based on DBC's Primo gateway
8 # Supports sortby with one argument with or without /ascending/descending
9 # These are translated to parameters to the opensearch server. The rest of
10 # the query is passed on verbatim.
12 # See DBC-45 in our Jira
15 # Heikki Levanto, Index Data
17 # (C) Copyright 2012-2013 Index Data
19 # Example opensearch url (split for readability)
21 #http://opensearch.addi.dk/next_2.2/?
28 # facets.numberOfTerms=5&
29 # facets.facetName=facet.creator&
30 # facets.facetName=facet.type
32 #http://opensearch.addi.dk/next_2.2/?action=search&query=hammer&agency=100200&profile=test&start=1&stepValue=3&facets.numberOfTerms=5&facets.facetName=facet.creator&facets.facetName=facet.type
34 # A simple way to test the script:
35 # ./dbc-opensearch-gw.pl -1 &
36 # zoomsh "open @:9999/default,agency=100200&profile=test" \
37 # "find cql:hamlet sortby creator" "show 0" "quit")
44 use Net::Z3950::SimpleServer;
52 my $gwversion = "1.5";
54 ############# Configuration
55 my $configfilename = "dbc-opensearch-gw.cfg";
58 # The following can be overwritten in the config file
59 # It consists of namevalue pairs, separated by a colon
60 # The names are like the variable names below, without the $
63 # White space and #comments are ignored
68 my $objectformat = {};
69 $baseurl{'Default'} = "http://opensearch.addi.dk/2.2/";
70 $constantparams{'Default'} = "action=search&collectionType=manifestation";
71 my $chunksize = 10; # initial, grows to max present req.
72 my $prettyxml = 0; # 1 for formatting XML a bit, 0 for not
79 # Magic value to tell that a term is not to be included in the query
80 # when it contained sort stuff that has been extracted in the session
81 my $magic_sort_indicator = " MAGIC SORT INDICATOR ";
83 ############ Config file
87 my $cfile = $options->{CONFIG};
88 # Override config filename with command line value
89 if ($cfile ne "default-config") {
90 $configfilename = $cfile;
92 if (! -r $configfilename) {
93 # die if we explicit gave a config file and it isn't present
94 die "Error opening configuration file given by -c $configfilename: $!\n" if ($cfile ne "default-config");
95 yazlog("WARN: Could not open config file $configfilename. Running with defaults.");
98 yazlog("Reading configuration file $configfilename");
99 open(my $F,$configfilename)
100 or die "Error opening config file $configfilename: $!\n";
101 my $database = "Default";
105 s/\#.*$//; # remove comments
106 s/\s*$//; # and trailing spaces
107 if ( /^ *([^ :]+) *: *(.*)$/ ) {
108 yazlog("Config setting $1 : $2") if $debug;
109 if ($1 eq "baseurl") {
110 $baseurl{$database} = $2;
111 } elsif ($1 eq "urlpath") {
112 die "$configfilename:$line: urlpath not supported anymore. Use baseurl";
113 } elsif ($1 eq "constantparams") {
114 $constantparams{$database} = $2;
115 } elsif ($1 eq "chunksize") {
117 } elsif ($1 eq "prettyxml") {
119 } elsif ($1 eq "debug") {
121 } elsif ($1 eq "test_data") {
123 } elsif ($1 eq "op_and") {
125 } elsif ($1 eq "op_or") {
127 } elsif ($1 eq "op_not") {
129 } elsif ($1 eq "objectformat") {
130 $objectformat->{$database} = $2;
131 } elsif ($1 eq "fields") {
133 if ($fname !~ /^\// ) {
134 $fname = dirname($configfilename) . "/" . $fname;
136 $fields->{$database} = readfields($fname);
137 } elsif ($1 eq "database") {
138 $constantparams{$2} = $constantparams{$database};
139 $baseurl{$2} = $baseurl{$database};
142 die "$configfilename:$line: Bad directive: $1\n";
147 die "$configfilename:$line: Bad syntax\n";
151 # Only log if debugging, as these are displayed before
152 # yaz processes the command line, and opens log file from -l
153 yazlog("Opensearch gateway $gwversion starting") if ($debug);
154 yazlog("Loaded config from $cfile") if ($debug);
160 yazlog("Reading fields file $fname");
161 open(my $F, $fname) or die "Error open fields file $fname\n";
165 s/\#.*$//; # remove comments
166 s/\s*$//; # and trailing spaces
167 my @list = split(/\s+/,$_);
168 my $cqlfield = $list[0];
169 if (defined($cqlfield) && $cqlfield =~ /\./) {
170 if (defined($fr->{$cqlfield})) {
171 print "$cqlfield already defined\n" if $debug;
174 if ( /^([^,]+),([^=]+)=(.*)$/ ) {
175 $fr->{$cqlfield}->{$2} = $3;
178 $fr->{$cqlfield}->{set} = $s;
179 } elsif ($s =~ /^bib-?1$/i ) {
180 $fr->{$cqlfield}->{set} = '1.2.840.10003.3.1';
181 } elsif ($s =~ /^dan-?1$/i ) {
182 $fr->{$cqlfield}->{set} = '1.2.840.10003.3.15';
183 } elsif ($s =~ /^dbc-?1$/i ) {
184 $fr->{$cqlfield}->{set} = '1.2.840.10003.3.1000.105.1';
186 die "Unknown attribute set $s\n";
188 } elsif ( /^([^=]+)=(.*)$/ ) {
189 $fr->{$cqlfield}->{$1} = $2;
198 foreach my $f (keys %{$fr}) {
200 foreach my $s (keys %{$fr->{$f}}) {
201 my $x = $fr->{$f}->{$s};
210 my $filename = shift;
212 my $chunksize = shift;
213 $filename = $filename . "_" . $start . "_" . $chunksize . ".xml";
214 yazlog("WARN: fetching test data only: $filename");
216 or die "Error opening test data file $filename: $!\n";
221 yazlog("Loaded test data $filename");
225 ############## Helpers
231 Net::Z3950::SimpleServer::yazlog($msg);
235 # Set the error items in the handle, and return an empty string
241 my $logmsg = shift; # optional
242 if ( $href ) { # defensive coding
243 $href->{ERR_CODE}=$errno;
244 $href->{ERR_STR}=$errtxt;
246 yazlog("ERROR $errno: $errtxt");
251 # Dump a handle, without the full record store
253 return unless $debug;
256 yazlog("Dumphandle: " . $msg);
257 my $session = $href->{HANDLE};
258 my $recs = $session->{records};
259 $session->{records} = "<<< records omitted>>>";
260 yazlog(Dumper($href));
261 $session->{records} = $recs;
264 ############## http client
266 # Fetch a page from the given URL
270 my $session = $href->{HANDLE};
271 my $ua = new LWP::UserAgent;
272 if ( ! $session->{cookies} ) {
273 $session->{cookies} = HTTP::Cookies->new( );
274 yazlog("Initialized a new cookie jar") if ($debug);
276 $ua->cookie_jar( $session->{cookies} );
277 my $req = new HTTP::Request GET => $url;
278 my $res = $ua->request($req);
279 if ( ! $res->is_success ) {
280 return err($href, 2, #temporary system error
281 "HTTP error from opensearch: ".$res->code. ": " .$res->message,
282 "fetching " . $url );
284 my $content = $res->content;
285 yazlog( "Received " . length($content). " bytes from $url");
286 if ( !utf8::valid($content) ) {
287 yazlog("The data is NOT VALID utf-8!!");
288 # Could return an error here, but probably better to limp along
290 # Force Perl to think the content as being utf-8
291 # If we get bad utf-8 data, things may fail in strange ways
292 # But without this, Perl assumes byte data, and helpfully
293 # converts it into utf-8, resulting in double-encoding.
295 Encode::_utf8_on($content);
296 # TODO - Check the http content-type header to see if we really got utf-8!
301 # Get number of records from opensearch.
302 # Detects some simple error codes
303 # Returns a XPathContext that has the actual document, and some namespaces
304 # defined. It can be used for finding nodes.
305 # Or an empty string to indicate errors
306 sub opensearchclient {
308 my $startrec = shift;
310 my $session = $href->{HANDLE};
311 my $query = $session->{query};
312 my $numrecs = $session->{chunksize};
313 my $dbname = $session->{dbbase};
314 my $extraargs = $session->{dbargs};
315 my $sort = $session->{sort};
317 my $urlparams = "?" . $constantparams{$dbname} . #all after '?'
318 "&start=$startrec". "&stepValue=$numrecs";
319 if (defined($session->{comp})) {
320 $urlparams .= "&objectFormat=" . $session->{comp};
323 $urlparams .= "&sort=$sort";
325 my $burl = $baseurl{$dbname};
326 yazlog("initial url parts: $burl $urlparams $query")
328 while ( $extraargs =~ /([^ &=]+)=([^ &]+)&?/g ) {
329 my $k = uri_unescape($1);
330 my $v = uri_unescape($2);
331 yazlog("Looking at extra parameter '$k' = '$v'") if $debug;
332 if ( $k eq "host" ) {
333 $burl = "http://" . $v. "/";
334 yazlog("Replaced host, got baseurl '$burl' ") if $debug;
335 } elsif ( $k eq "gwdebug" ) {
336 yazlog("Setting debug to $v because of a databasename parameter")
339 } elsif ( $urlparams =~ s/([?&])($k)=([^ &]+)/$1$k=$v/ ) {
340 yazlog("Replaced '$k': was '$3' is now '$v' ") if $debug;
342 $urlparams .= "&$k=$v";
343 yazlog("Appended '$k' = '$v'") if $debug;
346 yazlog("dbname: $dbname");
347 yazlog("final url parts: $burl $urlparams $query")
349 my $url = $burl . $urlparams . $query;
350 yazlog("final url: $url")
355 $page = fetchpage($href, $url);
358 $page = read_test_data($test_data, $startrec, $numrecs);
365 eval { $xmldom = XML::LibXML->load_xml(string => $page); };
367 return err( $href,100, #unspecified error
368 "Received bad XML from Opensearch: $@ ",
369 substr( $page,0,200 )."...");
371 my $xml = XML::LibXML::XPathContext->new($xmldom);
372 $xml->registerNs('os', 'http://oss.dbc.dk/ns/opensearch');
375 my $err = $xml->findvalue('//os:searchResponse/os:error');
377 return err( $href, 2, #temporary system error
378 "Error from Opensearch: " . $err,
379 substr( $page,0,400 )."...");
384 # Extract the hits into the cache in the session
388 my $session = $href->{HANDLE};
392 foreach my $rec ( $xml->findnodes('//os:searchResult') ) {
393 my $recno = $xml->findvalue('os:collection/os:resultPosition',$rec) ;
395 return err( $href, 2, #temporary system error
396 "Got a bad record from opensearch (no resultPosition)" );
398 $first = $recno unless ($first);
400 # Clone the node, so we get namespace definitions too
401 my $clone = $rec->cloneNode(1);
402 my $comp = $session->{comp};
403 $session->{records}->{$comp}->[$recno] = $clone->toString($prettyxml);
404 yazlog("Doc $recno: " .
405 length($session->{records}->{$comp}->[$recno]) . " bytes" )
408 yazlog("Extracted records $first - $last") if $debug;
412 # extract facets from the xml into the session, in a form that can
413 # be returned directly in the searchresponse.
417 my $session = $href->{HANDLE};
419 bless $zfacetlist, 'Net::Z3950::FacetList';
423 foreach my $facetnode ( $xml->findnodes('//os:facetResult/os:facet') ) {
424 #yazlog("Got facet " . $facetnode );
425 my $facetname = $xml->findvalue('os:facetName', $facetnode);
426 my $zfacetfield = {};
427 bless $zfacetfield, 'Net::Z3950::FacetField';
428 $zfacetlist->[$i++] = $zfacetfield;
429 my $zattributes = [];
430 bless $zattributes, 'Net::Z3950::RPN::Attributes';
431 $zfacetfield->{'attributes'} = $zattributes;
433 bless $zattribute, 'Net::Z3950::RPN::Attribute';
434 $zattribute->{'attributeType'} = 1;
435 $zattribute->{'attributeValue'} = $facetname;
436 $zattributes->[0]=$zattribute;
437 my $zfacetterms = [];
438 bless $zfacetterms, 'Net::Z3950::FacetTerms';
439 $zfacetfield->{'terms'} = $zfacetterms;
440 my $debugfacets = $facetname . " :";
442 foreach my $facetterm ( $xml->findnodes('os:facetTerm',$facetnode) ) {
443 # They seem to misspell frequency. Check both, for the case they
444 # get around to fixing it.
445 my $freq = $xml->findvalue('os:frequence', $facetterm) ||
446 $xml->findvalue('os:frequency', $facetterm);
447 my $term = $xml->findvalue('os:term', $facetterm);
448 $debugfacets .= " '" . $term . "'=" . $freq;
450 bless $zfacetterm, 'Net::Z3950::FacetTerm';
451 $zfacetterm->{'term'} = $term;
452 $zfacetterm->{'count'} = $freq;
453 $zfacetterms->[$j++] = $zfacetterm;
455 yazlog($debugfacets) if ($debug);
458 $session->{facets} = $zfacetlist;
463 # Check that we have the needed records in the cache, fetch if need be
468 my $session = $href->{HANDLE};
470 if (defined($href->{COMP})) {
471 $session->{comp} = $href->{COMP};
473 $session->{comp} = $session->{def_comp};
475 yazlog("Checking start=$start, num=$num") if ($debug);
476 if ( $num > $session->{chunksize} ) {
477 $session->{chunksize} = $num;
479 # Skip the records we already have
480 my $comp = $session->{comp};
481 while ( $num && $session->{records}->{$comp}->[$start] ) {
485 if ( $num == 0 && $session->{hits} ) { # we have a hit count and have them all
486 yazlog("no need to get more records") if ($debug);
487 return; # no need to fetch anything
489 my ($xml,$page) = opensearchclient($href, $start);
491 return; # error has been set already
493 if ( ! $session->{hits} ){
494 my $hits = $xml->findvalue('//os:searchResponse/os:result/os:hitCount');
495 if ( length($hits) == 0 ) { # can't just say !$hits, triggers on "0"
496 return err($href, 100, "No hitcount in response");
498 $session->{hits} = $hits;
499 # Do not attempt to extract facets on zero hits
504 get_results($href,$xml);
507 # Remove the sortby clause from the CQL query, translate to
508 # opensearch sort parameter, and put it in the session.
509 # Handles only one sort key
513 my $session = $href->{HANDLE};
515 if ( $qry =~ /^(.*?) +sortby *(\w+)(\/(\w+))?(.*) *$/ ) {
516 yazlog("Separated query '$1' from sort clause '$2' '$3' leaving '$5' ") if $debug;
519 my $direction = $4 || "ascending";
521 return err($href, 211, "Only one sort key supported" );
523 if ( $sortclause ne "random" ) {
524 $sortclause .= "_" . $direction;
527 return ( $qry, $sortclause );
530 ################# Query translation
534 my $session = $href->{HANDLE};
537 my $dbbase = $session->{dbbase};
538 if (!defined($fields->{$dbbase})) {
539 return err($href, 3, "No mapping defined for numeric attribtues");
541 $fr = $fields->{$dbbase};
542 my $a_set = '1.2.840.10003.3.1';
544 my $a_u = 1016; # use, type 1
545 my $a_r = 3; # relation, type 2
546 my $a_p = -1; # position, type 3
547 my $a_s = -1; # structure, type 4
548 my $a_t = 100; # truncation, type 5
549 my $a_c = -1; # completeness, type 6
550 while (my $attr = $t->{attributes}->[$i++]) {
551 my $t = $attr->{attributeType};
552 my $v = $attr->{attributeValue};
555 if (defined($attr->{attributeSet})) {
556 $a_set = $attr->{attributeSet};
569 return err($href, 113, $t);
576 my $structure_ok = 0;
577 my $completeness_ok = 0;
578 foreach my $f (keys %{$fr}) {
580 foreach my $s (keys %{$fr->{$f}}) {
581 my $v = $fr->{$f}->{$s};
585 } elsif (defined($fr->{$f}->{set}) && $a_set ne $fr->{$f}->{set}) {
592 if ($v =~ /^\d+?$/) {
603 if ($a_p != -1 && $v != $a_p) {
612 } elsif ($v =~ /^\d+?$/) {
618 } elsif ($v eq "pw") {
619 if ($a_s != 1 && $a_s != 2) {
627 if ($v =~ /^\d+?$/) {
628 $accept = undef unless $v == $a_t;
631 $accept = undef unless $v =~ /l/;
632 } elsif ($a_t == 2) {
633 $accept = undef unless $v =~ /r/;
635 $accept = undef unless $a_t == 100;
640 if ($a_c != -1 && $v =~ /^\d+?$/ && $v != $a_c) {
643 $completeness_ok = 1;
647 $best = $accept if $accept;
649 if (!defined($best)) {
650 return err($href, 114, $a_u) unless ($use_ok);
651 return err($href, 117, $a_r) unless ($relation_ok);
652 return err($href, 119, $a_p) unless ($position_ok);
653 return err($href, 118, $a_s) unless ($structure_ok);
654 return err($href, 122, $a_c) unless ($completeness_ok);
655 return err($href, 123, "");
663 my $session = $href->{HANDLE};
670 my $term = $t->{term};
672 # ### Can not test, simpleServer gets such a bad handle
673 return err($href, 108, # malformed query
674 "Empty term not supported" );
677 while (my $attr = $t->{attributes}->[$i++])
679 #print "Attr: " . Dumper($attr) ;
680 my $aval = $attr->{attributeValue};
681 my $type = $attr->{attributeType};
683 if ($aval =~ /^\d+?$/) { # numeric use
684 $field = map_use_attr($href, $t);
685 return if ($href->{ERR_CODE});
689 } elsif ($type == 2) { # Relation
692 } elsif ($aval == 2) {
694 } elsif ($aval == 3) {
696 } elsif ($aval == 4) {
698 } elsif ($aval == 5) {
701 return err($href, 117, # unsupp relation
702 $aval, "Unsupported relation $aval");
704 } elsif ($type == 3) { # position
705 if ($aval < 1 || $aval > 3) {
706 return err ($href, 119, # unsupp position
707 $aval, "Unsupported position $aval");
709 } elsif ($type == 4) { # structure
710 if ($aval == 1) { # phrase
711 # Not working, DBC-112
714 } elsif ($aval == 2 || $aval == 4) { # word / year
715 # nothing special to do
717 return err($href, 118, # unsupp structure
718 $aval, "Unsupported structure $aval");
720 } elsif ($type == 5) { # truncation
721 if ($aval == 1) { # right trunc
723 } elsif ($aval == 2) {
725 } elsif ($aval == 3) {
728 } elsif ($aval == 100) { # none
731 return err($href, 120, # unsupp relation
732 $aval, "Unsupported truncation $aval");
734 } elsif ($type == 6) { # completeness
736 } elsif ($type == 7) { # sort
737 if ($aval != 1 && $aval != 2) {
738 return err($href, 237, # illegal sort
739 $aval, "Illegal sort (attr 7): $aval");
743 return err($href, 113, # unupported attribute type
745 "Unsupported attribute type= " . $type.
746 " val='" . $aval ."'");
750 if ($session->{sort}) {
751 return err($href, 237, # illegal sort
752 "Only one sort supported");
754 my $direction = "_ascending";
755 if ($sort == 2) { $direction = "_descending"; }
756 if ($field eq "random" ) { $direction = ""; }
757 $session->{sort} = $field.$direction;
758 return $magic_sort_indicator;
760 if (($rtrunc || $ltrunc) && $quote) { # We can not do truncation on phrases
761 return err($href, 120, # unsupp trunc
762 "", "Can not do truncation on phrases");
764 # Escape characters that would be taken as wildcards
765 $term =~ s/([*?^"])/\\$1/g;
766 $term = $quote.$ltrunc.$term.$rtrunc.$quote;
769 $clause = $field . " " . $operator . " " . $term;
771 yazlog("q_term: $clause" ) if ($debug);
779 if ( $class eq "Net::Z3950::RPN::Term" ) {
780 return q_term($href, $n);
782 my %ops = ( "Net::Z3950::RPN::And" => $op_and,
783 "Net::Z3950::RPN::Or" => $op_or,
784 "Net::Z3950::RPN::AndNot" => $op_not );
785 my $op = $ops{$class} ;
787 my $left = q_node($href,$n->[0]);
788 return "" unless $left;
789 my $right = q_node($href,$n->[1]);
790 return "" unless $right;
791 return $left if ( $right eq $magic_sort_indicator );
792 return $right if ( $left eq $magic_sort_indicator );
793 my $clause = "( $left $op $right )";
794 yazlog("q_node: $clause") if ($debug);
798 $opname =~ s/^.*:+//; # Remove the Net::... for error msg
799 return err($href,110, # operator not supported
801 "Operator '$class' not supported. Only 'And'");
807 my $qry = $href->{RPN};
808 my $class = ref($qry);
809 yazlog("Translating query") if ($debug);
810 if ( $class ne "Net::Z3950::APDU::Query" ) {
811 return err($href,100, # unspecified error
812 "Programming error, no query found",
813 "Class of query is '$class', not Net::Z3950::APDU::Query" );
815 # TODO - check attributeSet
816 my $query = q_node($href,$qry->{query});
817 yazlog("Translated query: $query" ) if ($debug);
822 ################# Request callbacks
827 $session->{chunksize} = $chunksize; # to start with
828 $session->{records} = {};
829 $href->{HANDLE} = $session;
830 dumphandle( $href, "Init:");
835 my $session = $href->{HANDLE};
836 dumphandle( $href, "Search:");
837 $session->{hits} = 0;
838 $session->{facets} = [];
839 $session->{records} = {};
840 my $db = $href->{DATABASES}[0];
841 $session->{dbbase} = $db;
842 $session->{dbbase} =~ s/,.*$//; # without extraargs
843 if (! exists $constantparams{$session->{dbbase}}) {
844 return err( $href, 235, #Database does not exist
847 if (defined($objectformat->{$session->{dbbase}})) {
848 $session->{def_comp} = $objectformat->{$session->{dbbase}};
850 $session->{def_comp} = "dkabm";
852 if ($db =~ /.*,(.*)$/ ) {
853 $session->{dbargs} = $1;
855 $session->{dbargs} = "";
857 $session->{sort} = '';
858 my $qry = $href->{CQL};
861 ( $qry, $sortby ) = fixsortquery($href,$qry) ; # Remove CQL sortby clause
863 return; # error already set
864 $session->{sort} = $sortby;
867 $qry = q_query($href);
870 return; # err is already set
872 $session->{query} = "&query=" . uri_escape($qry);
873 my $number = $href->{PRESENT_NUMBER};
874 #my $number = $session->{chunk_size};
875 getrecords($href, 1, $number);
876 $href->{HITS} = $session->{hits};
877 if ( $session->{facets} ) {
878 $href->{OUTPUTFACETS} = $session->{facets};
882 sub present_handler {
887 dumphandle( $href, "Fetch:");
888 my $offset = $href->{OFFSET};
889 my $session = $href->{HANDLE};
890 getrecords($href,$offset,1);
891 my $comp = $session->{comp};
892 my $record = $session->{records}->{$comp}->[$offset];
894 return err( $href, 13, # present out of range,
897 $href->{REP_FORM} = Net::Z3950::OID::xml;
898 $href->{RECORD} = $record;
899 $href->{LEN} = length($record);
900 $href->{NUMBER} = $offset;
901 $href->{BASENAME} = $session->{dbbase};
906 dumphandle( $href, "Close:");
910 ########### Main program
913 my $handler = new Net::Z3950::SimpleServer(START => \&readconfig,
914 INIT => \&init_handler,
915 CLOSE => \&close_handler,
916 SEARCH => \&search_handler,
917 FETCH => \&fetch_handler,
918 PRESENT => \&present_handler);
920 $handler->launch_server("opensearch-gw.pl", @ARGV);