2 # Analyzing DBC's example queries
3 # Step 2: Count identical queries, and query types
4 # Assumes x2 is the result of process1.pl, sorted alphabetically
# --- Setup: input handle, sorted output pipe, and tallies ---
# Open the input file x2 for reading (a two-argument open with no mode
# prefix opens for input); abort with the OS error text on failure.
8 open F, "x2" or die "could not open x2: $!\n";
# Everything printed to OUT is piped through `sort -n -r` (numeric,
# descending), and the sorted result is written to x3.
# NOTE(review): for a pipe open, a failure of the sort command itself is
# normally only reported when the handle is closed -- confirm that OUT is
# closed and its status checked before x3 is relied upon.
10 open OUT, "| sort -n -r > x3" or die "could not open sort>x3 for writing: $!\n";
# Column header for x3.
# NOTE(review): this header goes through the sort pipe like every other
# line, so its final position in x3 is decided by sort (presumably the
# bottom, since it has no leading number) -- confirm that is acceptable.
11 print OUT "#count ; query \n";
# Tallies. The ones below that are reported in the summary are added to in
# steps of $count, i.e. they count query occurrences, not distinct strings.
14 my $count = 1; # how many times seen the same query
16 my $booleans = 0; # queries that contain a boolean operator
17 my $fields = 0; # queries that start with field=, f.ex. title or author
18 my $multifields = 0; # queries that contain more than one field
19 my $simplequeries = 0; # queries without booleans or fields
# Reported as "Unique queries" in the summary.
# NOTE(review): presumably incremented once per group of identical
# queries -- verify where it is updated.
20 my $uniquequeries = 0;
# --- Per-line processing of the sorted input ---
# NOTE(review): the statements below read $_ implicitly (split without an
# explicit string, bare regex matches), so they presumably run inside a
# loop over the lines of F -- confirm against the loop header.
#
# Split the current line ($_) on ';' into the query text, its hit count
# and its page count.
29 my ( $query, $hits, $pages ) = split (';');
# A hits field containing a double quote or a lowercase letter means the
# query text itself contained ';' and the split is misaligned: skip it.
30 next if ( $hits =~ /["a-z]/); # semicolons in original query, ignore
# Same query string as the one currently being counted.
# NOTE(review): presumably $count is bumped here and the code below runs
# only when the query changes -- confirm.
31 if ( $thisquery eq $query ){
# Emit one "count ; query" record for the finished group; the leading
# count is what `sort -n -r` on OUT orders by.
34 print OUT "$count ; $thisquery \n";
35 $totalqueries += $count;
# NOTE(review): this adds 1, whereas every other tally adds $count, yet
# it is later printed as a percentage of $totalqueries -- confirm that
# the differing unit is intended.
37 $singlehits += 1 if ($hits <= 1 );
# Boolean-operator test: looks for the words "og", "eller" or "ikke"
# surrounded by single spaces, case-insensitively (presumably Danish
# and / or / not). Note that it matches against $_ -- the whole input
# line -- rather than against $query.
40 if ( / og /i || / eller /i || / ikke /i ) {
# Walk over every "name=value" occurrence in the query. Capture 1 is the
# run of characters before '=' that contains no space, '(', '=', '%',
# quote or double quote; capture 2 is the non-space run after '=' and any
# whitespace. The /g flag in loop condition advances match by match.
46 while ( $query =~ /([^ (=%'"]+)=\s*([^ ]+)/g ) {
# Second or later field found in the same query.
47 if (++$fieldcount >1 ) {
# Diagnostic on STDOUT: several fields in a query that was not flagged
# as boolean.
49 print "OOPS: $fieldcount $query \n" unless $is_boolean;
# NOTE(review): this sits inside the while loop, so a query with three or
# more fields may be added to $multifields more than once -- verify.
50 $multifields += $count;
54 #print "Field loop: '$fld' $query \n";
# Per-field usage tally, weighted by how often the query occurred. The
# explicit zero-initialisation avoids adding to an undefined hash entry.
# NOTE(review): $fld is presumably taken from capture 1 above -- confirm.
56 $field{$fld} = 0 unless defined($field{$fld});
57 $field{$fld} += $count;
# Tally for queries with neither boolean operators nor fields (per the
# comment on the declaration of $simplequeries).
60 $simplequeries += $count;
# Strip a leading "name=" prefix before counting terms.
# NOTE(review): without /g only the first "name=" in $q2 is removed, so
# multi-field queries keep their remaining prefixes -- confirm intended.
# $q2 is presumably a working copy of $query.
63 $q2 =~ s/\S+=//; # remove fields
# Classify by number of whitespace-separated terms: exactly one, exactly
# two, exactly three, or four and more (the last pattern is deliberately
# not anchored at the end, so any longer query matches it too).
67 $singleterms += $count if ($q2 =~ /^\s*\S+\s*$/ );
68 $doubleterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s*$/ );
69 $tripleterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s+\S+\s*$/ );
70 $manyterms += $count if ($q2 =~ /^\s*\S+\s+\S+\s+\S+\s+\S+/ );
71 #print "$query ; $hits ; $pagecount\n";
# --- Summary output ---
# Re-point OUT at the summary file x4 (opened for writing, truncating).
# NOTE(review): if OUT is still the sort pipe at this point, reopening it
# closes the pipe implicitly and any error from sort is lost -- confirm
# that an explicit, checked close precedes this.
78 open OUT, ">x4" or die "could not open x4 for writing summary: $!\n";
# Body of the summary-line formatter (presumably the sub `line` that the
# report calls use -- confirm against the sub header).
# Arguments: caption text, the number to print, and a flag saying whether
# a percentage of $totalqueries should be shown.
81 my ($capt, $number, $dopercent ) = @_;
# Percentage with one decimal place, truncated rather than rounded:
# int(number * 1000 / total) / 10, followed by a literal '%'.
# NOTE(review): this divides by $totalqueries; with an empty input that is
# presumably 0 and the script would die with a division-by-zero error --
# confirm whether a guard exists.
84 $percents = "". int( $number*1000 / $totalqueries ) / 10 . "%" ;
# Left-pad with spaces to a width of 6 so the percent column lines up.
86 while (length($percents) < 6 ) { $percents = " $percents"; }
# Caption left-justified in 20 columns, number as an integer right-
# justified in 7 columns, then the (possibly padded) percent string.
87 print OUT sprintf("%-20s %7d %s\n", $capt, $number, $percents);
# --- Summary report written to OUT ---
# Each call passes a caption, a tally and a flag; the flag is 0 only for
# the grand total (no percentage requested) and 1 for every other row.
90 line "Total queries", $totalqueries, 0;
91 line "Unique queries", $uniquequeries, 1;
# Breakdown by query type.
93 line "Boolean queries", $booleans, 1;
94 line "Fielded queries", $fields, 1;
95 line "Multiple fields", $multifields, 1;
96 line "Simple queries", $simplequeries, 1;
97 line "One-hit queries", $singlehits, 1;
# Breakdown by number of search terms.
99 line "Single term", $singleterms, 1;
100 line "Double terms", $doubleterms, 1;
101 line "Triple terms", $tripleterms, 1;
102 line "Many terms", $manyterms, 1;
# Per-field usage table, preceded by a blank line and a heading.
104 print OUT "\nFields\n";
# Field names ordered by their tally, largest first (numeric comparison
# with $b before $a).
105 for my $k (sort{ $field{$b} <=> $field{$a} } keys(%field) ) {
# Only fields whose tally exceeds 100 are listed; rarer ones are dropped
# from the report. The caption is the field name with a leading space.
106 line " $k", $field{$k}, 1 if ($field{$k} > 100);