Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members   Examples  

index.C

Go to the documentation of this file.
00001 // $Id: index.C,v 1.1.1.1 2002/03/24 12:37:01 dvermeir Exp $
00002 #include <vector>
00003 // #include <limits>
00004 #include <sstream>
00005 #include <algorithm>
00006 #include <dvutil/tostring.h>
00007 #include <dvutil/date.h>
00008 #include <config.h>
00009 #include "index.h"
00010 #include "word.h"
00011 #include "wordstreamiterator.h"
00012 
00013 Index::Index(size_t word_sz, istream* is): word_size_(word_sz) {
00014 if (is)
00015   ignore(*is);
00016 }
00017 
00018 Index::~Index() {
00019 }
00020 
// Tag that opens the first line of an index file: operator<< writes it and
// operator>> rejects any stream whose first token differs from it.
// The pointer itself is const so the tag cannot be re-pointed at run time.
static const char* const MAGIC = "#textindexer";
00022 
00023 ostream&
00024 operator<<(ostream& os, const Index& index) {
00025 // Write first line: MAGIC version#
00026 os << MAGIC << " " << VERSION << " -- do not edit this file" << endl;
00027 // Write pool contents & remember the position of each pointer.
00028 os << index.dates_.size() << endl;
00029 map<const string*,int> file_nrs;
00030 size_t n(0);
00031 for (Index::Dates::const_iterator i = index.dates_.begin(); 
00032      i!=index.dates_.end(); ++i) {
00033   os << (*i).second << ' ' << *((*i).first) << "\n";
00034   file_nrs[(*i).first] = n++;
00035   }
00036 for (Index::iterator i=index.begin(); i!= index.end(); ++i) {
00037   os << (*i).first;
00038   const Index::Files& files((*i).second);
00039   for (Index::Files::const_iterator j = files.begin(); j!=files.end(); ++j)
00040     os << " " << file_nrs[*j];
00041   os << "\n";
00042   }
00043 return os;
00044 }
00045 
00046 istream&
00047 operator>>(istream& is, Index& index) throw (runtime_error) {
00048 // Read first line: MAGIC version junk
00049 // Normally, we would use ignore(numeric_limits<int>::max(), '\n')
00050 // but #include <limits> does not work in this g++ version.
00051 string magic;
00052 string junk;
00053 double version;
00054 is >> magic >> version;
00055 getline(is,junk);
00056 if (magic!=MAGIC)
00057   throw runtime_error("Bad magic in textindexer index file");
00058 // Perhaps a test on version should come here.
00059 vector<const string*> files;
00060 StringPool& pool(index.pool());
00061 size_t n;
00062 is >> n;
00063 string filename;
00064 time_t t;
00065 for (size_t i=0; i<n; ++i) {
00066   is >> t;
00067   if (t==0)
00068     throw runtime_error("Index reader: time == 0");
00069   is.ignore();
00070   getline(is, filename);
00071   if (filename.size()==0)
00072     throw runtime_error("Index reader: empty filename ");
00073   const string* pfn(pool[filename]);
00074   files.push_back(pfn);
00075   index.dates_[pfn] = t;
00076   }
00077 string line;
00078 while (getline(is,line)) {
00079   istringstream iss(line);
00080   Word w;
00081   iss >> w;
00082   size_t i;
00083   while (iss>>i) {
00084     if (i>=files.size())
00085       throw runtime_error(Dv::Util::tostring(i) + ": illegal file#");
00086     index.map_[w.str()].insert(files[i]);
00087     }
00088   }
00089 return is;
00090 }
00091 
00092 time_t
00093 Index::date(const string& fn) const {
00094 const string* pfn(file(fn));
00095 if (!pfn)
00096   return 0;
00097 Dates::const_iterator i = dates_.find(pfn);
00098 if (i==dates_.end())
00099   return 0;
00100 return (*i).second;
00101 }
00102 
00103 const Index::Files*
00104 Index::query(const string& s) const {
00105 string ss(s,0,word_size());
00106 Map::const_iterator i(map_.find(ss));
00107 if (i==map_.end())
00108   return 0;
00109 return &(*i).second;
00110 }
00111 
00112 
00113 Index::Files&
00114 Index::query(const Strings& q, Index::Files& final_result) const {
00115 Files   result[2];
00116 int     r(0); // index of result
00117 for (Strings::const_iterator i = q.begin(); i!=q.end(); ++i) {
00118   const Index::Files* qr(query(*i));
00119   if ( qr == 0) { // one of the strings not found: just return empty set
00120     final_result.clear();
00121     return final_result;
00122     }
00123   // assert qr != 0
00124   int nr((r+1)%2); // index of next result
00125   if (i==q.begin()) // first result, just store in result[nr]
00126     result[nr] = *qr;
00127   else {
00128     result[nr].clear();
00129     set_intersection(result[r].begin(), result[r].end(), 
00130                      qr->begin(), qr->end(),
00131                      inserter(result[nr],result[nr].end()));
00132     }
00133   r = nr;
00134   }
00135 final_result = result[r];
00136 return final_result;
00137 }
00138   
00139 size_t
00140 Index::insert(const string& fn, istream& is) {
00141 const string* pfn(file(fn));
00142 if (pfn)
00143   remove(fn);
00144 pfn = pool_[fn];
00145 size_t n(0);
00146 wordstream_iterator i(is,ignore());
00147 wordstream_iterator end;
00148 for (; i!=end; ++i) {
00149   string w(*i,0,word_size());
00150   n += (map_[w].insert(pfn).second ? 1 : 0);
00151   }
00152 dates_[pfn] = Dv::Util::Date().time();
00153 return n;
00154 }
00155 
00156 void
00157 Index::remove(const string& fn) {
00158 const string* pfn(file(fn));
00159 if (pfn == 0)
00160   return; 
00161 for (Map::iterator i = map_.begin(); i!=map_.end(); ++i) {
00162   Files& files((*i).second);
00163   files.erase(pfn);
00164   if (files.size() == 0)
00165     map_.erase(i);
00166   }
00167 dates_.erase(pfn);
00168 pool_.remove(fn);
00169 }
00170 
00171 void
00172 Index::ignore(const string& w) {
00173 ignore_.insert(w);
00174 }
00175 
00176 void
00177 Index::ignore(istream& is) {
00178 Word w;
00179 while (is>>w)
00180  ignore(w);
00181 }

textindexer-0.2 [27 March, 2002]