00001
00002 #include <vector>
00003
00004 #include <sstream>
00005 #include <algorithm>
00006 #include <dvutil/tostring.h>
00007 #include <dvutil/date.h>
00008 #include <config.h>
00009 #include "index.h"
00010 #include "word.h"
00011 #include "wordstreamiterator.h"
00012
00013 Index::Index(size_t word_sz, istream* is): word_size_(word_sz) {
00014 if (is)
00015 ignore(*is);
00016 }
00017
00018 Index::~Index() {
00019 }
00020
// Tag written as the first token of an index file by operator<< and checked
// by operator>>. Both the characters and the pointer itself are const so the
// tag cannot be re-pointed by accident.
static const char* const MAGIC = "#textindexer";
00022
00023 ostream&
00024 operator<<(ostream& os, const Index& index) {
00025
00026 os << MAGIC << " " << VERSION << " -- do not edit this file" << endl;
00027
00028 os << index.dates_.size() << endl;
00029 map<const string*,int> file_nrs;
00030 size_t n(0);
00031 for (Index::Dates::const_iterator i = index.dates_.begin();
00032 i!=index.dates_.end(); ++i) {
00033 os << (*i).second << ' ' << *((*i).first) << "\n";
00034 file_nrs[(*i).first] = n++;
00035 }
00036 for (Index::iterator i=index.begin(); i!= index.end(); ++i) {
00037 os << (*i).first;
00038 const Index::Files& files((*i).second);
00039 for (Index::Files::const_iterator j = files.begin(); j!=files.end(); ++j)
00040 os << " " << file_nrs[*j];
00041 os << "\n";
00042 }
00043 return os;
00044 }
00045
00046 istream&
00047 operator>>(istream& is, Index& index) throw (runtime_error) {
00048
00049
00050
00051 string magic;
00052 string junk;
00053 double version;
00054 is >> magic >> version;
00055 getline(is,junk);
00056 if (magic!=MAGIC)
00057 throw runtime_error("Bad magic in textindexer index file");
00058
00059 vector<const string*> files;
00060 StringPool& pool(index.pool());
00061 size_t n;
00062 is >> n;
00063 string filename;
00064 time_t t;
00065 for (size_t i=0; i<n; ++i) {
00066 is >> t;
00067 if (t==0)
00068 throw runtime_error("Index reader: time == 0");
00069 is.ignore();
00070 getline(is, filename);
00071 if (filename.size()==0)
00072 throw runtime_error("Index reader: empty filename ");
00073 const string* pfn(pool[filename]);
00074 files.push_back(pfn);
00075 index.dates_[pfn] = t;
00076 }
00077 string line;
00078 while (getline(is,line)) {
00079 istringstream iss(line);
00080 Word w;
00081 iss >> w;
00082 size_t i;
00083 while (iss>>i) {
00084 if (i>=files.size())
00085 throw runtime_error(Dv::Util::tostring(i) + ": illegal file#");
00086 index.map_[w.str()].insert(files[i]);
00087 }
00088 }
00089 return is;
00090 }
00091
00092 time_t
00093 Index::date(const string& fn) const {
00094 const string* pfn(file(fn));
00095 if (!pfn)
00096 return 0;
00097 Dates::const_iterator i = dates_.find(pfn);
00098 if (i==dates_.end())
00099 return 0;
00100 return (*i).second;
00101 }
00102
00103 const Index::Files*
00104 Index::query(const string& s) const {
00105 string ss(s,0,word_size());
00106 Map::const_iterator i(map_.find(ss));
00107 if (i==map_.end())
00108 return 0;
00109 return &(*i).second;
00110 }
00111
00112
00113 Index::Files&
00114 Index::query(const Strings& q, Index::Files& final_result) const {
00115 Files result[2];
00116 int r(0);
00117 for (Strings::const_iterator i = q.begin(); i!=q.end(); ++i) {
00118 const Index::Files* qr(query(*i));
00119 if ( qr == 0) {
00120 final_result.clear();
00121 return final_result;
00122 }
00123
00124 int nr((r+1)%2);
00125 if (i==q.begin())
00126 result[nr] = *qr;
00127 else {
00128 result[nr].clear();
00129 set_intersection(result[r].begin(), result[r].end(),
00130 qr->begin(), qr->end(),
00131 inserter(result[nr],result[nr].end()));
00132 }
00133 r = nr;
00134 }
00135 final_result = result[r];
00136 return final_result;
00137 }
00138
00139 size_t
00140 Index::insert(const string& fn, istream& is) {
00141 const string* pfn(file(fn));
00142 if (pfn)
00143 remove(fn);
00144 pfn = pool_[fn];
00145 size_t n(0);
00146 wordstream_iterator i(is,ignore());
00147 wordstream_iterator end;
00148 for (; i!=end; ++i) {
00149 string w(*i,0,word_size());
00150 n += (map_[w].insert(pfn).second ? 1 : 0);
00151 }
00152 dates_[pfn] = Dv::Util::Date().time();
00153 return n;
00154 }
00155
00156 void
00157 Index::remove(const string& fn) {
00158 const string* pfn(file(fn));
00159 if (pfn == 0)
00160 return;
00161 for (Map::iterator i = map_.begin(); i!=map_.end(); ++i) {
00162 Files& files((*i).second);
00163 files.erase(pfn);
00164 if (files.size() == 0)
00165 map_.erase(i);
00166 }
00167 dates_.erase(pfn);
00168 pool_.remove(fn);
00169 }
00170
00171 void
00172 Index::ignore(const string& w) {
00173 ignore_.insert(w);
00174 }
00175
00176 void
00177 Index::ignore(istream& is) {
00178 Word w;
00179 while (is>>w)
00180 ignore(w);
00181 }