Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  

logrecord.C

Go to the documentation of this file.
00001 // $Id: logrecord_C-source.html,v 1.1 2001/04/16 17:43:02 dvermeir Exp $
00002 
00003 #include <string>
00004 #include <ctype.h> // isxdigit, isalnum
00005 #include "logrecord.h"
00006 #include "configuration.h"
00007 #include "date.h"
00008 
00009 // Retrieve and store relevant information from a line from a log file.
00010 bool
00011 LogRecord::parse_line(const string& line,const Configuration& conf) {
00012 /* A parse succeeds if all components can be parsed.
00013    We only parse the parts that are needed according to
00014    conf.criteria();
00015 */
00016 const set<Configuration::CRITERIUM>& criteria(conf.criteria());
00017 
00018 if (criteria.count(Configuration::DATE))
00019   if (!parse_date(line))
00020     return false;
00021 
00022 if (criteria.count(Configuration::PATH))
00023   if (!parse_path(line))
00024     return false;
00025 
00026 if (criteria.count(Configuration::DOMAIN))
00027   if (!parse_domain(line))
00028     return false;
00029 
00030 return true;
00031 }
00032 
00033 // Retrieve and store date information from a line from a log file.
00034 bool
00035 LogRecord::parse_date(const string& line) {
00036 /* Select date part: just after first '[' up to first folloing ' '
00037    e.g.
00038 
00039         spider1.tiscalinet.be - - [02/Nov/2000:10:20:36 +0100] ..
00040 
00041    will select
00042 
00043         02/Nov/2000:10:20:36
00044 */
00045 string::size_type date_start(line.find_first_of('['));
00046 if (date_start==string::npos)
00047   return false;
00048 string::size_type date_end(line.find_first_of(' ',date_start));
00049 if (date_end==string::npos)
00050   return false;
00051 string            date_string(line,date_start+1,date_end - date_start-1);
00052 if (date_string.size()==0)
00053   return false;
00054 /* Make date_string acceptable to the Date::Date(const char*) parser.
00055 
00056    1. replace '/' by '-'. In the example, this will yield 
00057 
00058         02-Nov-2000:10:20:36
00059 */
00060 for (string::size_type i=0; i<date_string.size(); ++i)
00061   if (date_string[i]=='/')
00062     date_string[i] = '-';
00063 /*
00064    2. Replace first ':' by space. In the example, this will yield 
00065 
00066         02-Nov-2000 10:20:36
00067 */
00068 date_string.replace(date_string.find_first_of(':'),1," ");
00069 
00070 /* Now parse using Date::Date(const string&). It will throw an exception
00071    if the date cannot be parsed. In that case, we catch the exception
00072    and return false.
00073 */
00074 try {
00075   Date  d(date_string);
00076   date_.set(d.year(),d.month(),d.day(),d.hours());
00077   }
00078 catch (exception& e) {
00079   cerr << "LogRecord::parse_date failed: " << e.what() << endl;
00080   return false;
00081   }
00082 return true;
00083 }
00084 
00086 
// Return the numeric value (0..15) of a hexadecimal digit.
// Precondition: isxdigit(c) holds; for any other character the result
// is meaningless.
static inline int
hexdigit(char c) {
  // The <ctype.h> classifiers are only defined for values representable
  // as unsigned char (or EOF), so a possibly negative plain char must be
  // converted before it is tested.
  const unsigned char uc = static_cast<unsigned char>(c);
  if (isdigit(uc))
    return uc - '0';
  else if (isupper(uc))
    return uc - 'A' + 10;
  else // must be a lower case letter
    return uc - 'a' + 10;
}

// In-place decoding of an encoded url, see RFC 1738.
//
// Every "%XY" where X and Y are hexadecimal digits is replaced by the
// single character with code 0xXY. A '%' that is not followed by two
// hexadecimal digits is copied unchanged, as is every other character.
// The string never grows; it is shortened to the decoded length.
void
www_decode(std::string& s) {
  const std::string::size_type len(s.size());
  std::string::size_type out(0); // output index, never ahead of the input index

  for (std::string::size_type in(0); in<len; ++in,++out) {
    // '+' is deliberately not translated to ' ': apparently it is not
    // used in the log file.
    const bool escape =
      s[in]=='%' && (in+2<len) &&
      isxdigit(static_cast<unsigned char>(s[in+1])) &&
      isxdigit(static_cast<unsigned char>(s[in+2]));

    if (escape) {
      s[out] = static_cast<char>(hexdigit(s[in+1])*16 + hexdigit(s[in+2]));
      in += 2; // skip the two hex digits just consumed
      }
    else
      s[out] = s[in];
    }
  s.resize(out);
}
00120 
00121 // Retrieve and store path information from a line from a log file.
00122 bool
00123 LogRecord::parse_path(const string& line) {
00124 /* The path part can be found between the first and second occurrences
00125    of `"' (double quote). An example is shown below.
00126 
00127     "GET /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00128 
00129    This will result in 
00130 
00131      _path = <"","ssl","kiesoos.cgi">
00132 */
00133 string::size_type path_start(line.find_first_of('\"'));
00134 if (path_start==string::npos)
00135   return false;
00136 /* We are not interested in the verb (e.g. GET), so we skip until after the first ' '.
00137    In the example, path_start would then point to.
00138 
00139     /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00140 */
00141 path_start = line.find_first_of(' ',path_start);
00142 if (path_start==string::npos)
00143   return false;
00144 ++path_start; // After ' '.
00145 /*  We only want the path until the first '?', '#' or ' '. In the example
00146     this should result in.
00147 
00148     path_string = /ssl/kiesoos.cgi
00149 */
00150 
00151 string::size_type path_end = line.find_first_of("#? ",path_start);
00152 if (path_end==string::npos)
00153   return false;
00154 
00155 if (path_end==path_start) // cannot have an empty path
00156   return false;
00157 
00158 string path_string(line,path_start,path_end - path_start);
00159 
00160 // Decode to translate "%7E" back to '~' etc.
00161 www_decode(path_string);
00162 
00163 return path_.parse(path_string);
00164 }
00165 
00166 // Retrieve and store domain information from a line from a log file.
00167 bool
00168 LogRecord::parse_domain(const string& line) {
00169 /* The domain informatin is easy to find: it's the first part of
00170    a line, followed by a ' ', as illustrated in the following example.
00171 
00172         igwe.vub.ac.be - - [12/Feb/2000:13:18:49 +0100] ...
00173         
00174 */
00175 string::size_type domain_start(0);
00176 string::size_type domain_end(line.find_first_of(' '));
00177 if (domain_end==string::npos)
00178   return false;
00179 string domain_string(line,domain_start,domain_end);
00180 
00181 return domain_.parse(domain_string);
00182 }

httpstats-stage04 [ 7 April, 2001]