#!/usr/bin/perl -w
# Copyright (C) 2009 Arnoldo Jose Muller Molina
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Prepare a query and a DB from a newline separated DB of data.
# We also make sure that the DB and the query do not have repeated objects.
#
# usage:
#   prepareData.pl <input_file> <db_size> <query_size> <query>
# where:
#   <input_file>  is the file name of a database of newline separated objects.
#   <db_size>     is the final size of the db.
#   <query_size>  is the size of the query.
#   <query>       is the query mode written in front of every query object;
#                 preceded by a - sign it is a k-nn query, otherwise it is a
#                 range query.
# Output:
#   A new file <input_file>.query will be created with the query objects.
#   A new file <input_file>.db will be created with the remaining objects
#   (objects placed in the generated query are never included in the db).

use strict;
use warnings;

die "usage: $0 <input_file> <db_size> <query_size> <query>\n"
    unless @ARGV == 4;

my ($input_file, $db_size, $query_size, $query) = @ARGV;

# Both sizes are compared numerically below, so reject anything that is not
# a plain non-negative integer up front.
for my $size ($db_size, $query_size) {
    die "Sizes must be non-negative integers, got '$size'\n"
        unless $size =~ /\A\d+\z/;
}

# Load unique lines into the hash. Lines are chomped so that a final line
# lacking a trailing newline is neither treated as a distinct object nor
# glued to the next record when written out.
open my $in_fh, '<', $input_file
    or die "Could not open DB file $input_file: $!";
my %db;
while (my $line = <$in_fh>) {
    chomp $line;
    $db{$line} = 1;
}
close $in_fh
    or die "Could not close DB file $input_file: $!";

# Freeze one ordering of the unique objects; the random indexes chosen below
# refer to positions in this list.
my @objects    = keys %db;
my $totalLines = scalar @objects;
print "total lines: $totalLines\n";

# The selection loop below draws distinct indexes in [0, $totalLines); asking
# for more than exist would never terminate.
die "Query size $query_size exceeds the $totalLines unique objects available\n"
    if $query_size > $totalLines;

# Generate $query_size distinct random positions.
my %is_query_index;
while (scalar(keys %is_query_index) < $query_size) {
    $is_query_index{ int(rand($totalLines)) } = 1;
}

# Output files are opened only after the input has been read and validated,
# so a bad invocation does not truncate existing results.
open my $db_fh, '>', "$input_file.db"
    or die "Could not open output DB file $input_file.db: $!";
open my $query_fh, '>', "$input_file.query"
    or die "Could not open output query file $input_file.query: $!";

# Separate the query from the db: selected positions go to the query file,
# the rest go to the db file until it holds $db_size objects.
my $final_db_size = 0;
for my $index (0 .. $#objects) {
    my $object = $objects[$index];
    if (exists $is_query_index{$index}) {
        print {$query_fh} "$query,$object\n";
    }
    elsif ($final_db_size < $db_size) {
        print {$db_fh} "$object\n";
        $final_db_size++;
    }
}

# Finish the query.
print {$query_fh} "-0\n";

# Buffered write errors surface at close time, so check both write handles.
close $query_fh
    or die "Could not close output query file $input_file.query: $!";
close $db_fh
    or die "Could not close output DB file $input_file.db: $!";

# Count the lines of a file.
#   $file - path of the file to read.
# Returns the number of lines read; dies if the file cannot be opened.
sub countLines {
    my ($file) = @_;
    open my $fh, '<', $file
        or die "Could not open DB file $file: $!";
    my $totalLines = 0;
    $totalLines++ while <$fh>;
    close $fh;
    return $totalLines;
}