Main Page   Class Hierarchy   Compound List   File List   Compound Members   File Members  


Go to the documentation of this file.
00001 // $Id: logrecord.C,v 1.4 2001/04/12 14:55:55 dvermeir Exp $
00003 #include <string>
00004 #include <ctype.h> // isxdigit, isalnum
00005 #include "logrecord.h"
00006 #include "date.h"
00008 // Retrieve and store relevant information from a line from a log file.
00009 bool
00010 LogRecord::parse_line(const string& line) {
00011 /* Clear current vector<string> data members. This is
00012    important because parsing will append to path_ and domain_.
00013 */
00014 path_.clear(); 
00015 domain_.clear();
00016 /* A parse succeeds if all components can be parsed.
00017    This should be modified so that we only parse what is necessary,
00018    according to the configuration. But for now, we are too lazy.
00019 */
00020 return parse_date(line) && parse_path(line) && parse_domain(line);
00021 }
00023 // Retrieve and store date information from a line from a log file.
00024 bool
00025 LogRecord::parse_date(const string& line) {
00026 /* Select date part: just after first '[' up to first folloing ' '
00027    e.g.
00029 - - [02/Nov/2000:10:20:36 +0100] ..
00031    will select
00033         02/Nov/2000:10:20:36
00034 */
00035 string::size_type date_start(line.find_first_of('['));
00036 string::size_type date_end(line.find_first_of(' ',date_start));
00037 string            date_string(line,date_start+1,date_end - date_start-1);
00039 /* Make date_string acceptable to the Date::Date(const char*) parser.
00041    1. replace '/' by '-'. In the example, this will yield 
00043         02-Nov-2000:10:20:36
00044 */
00045 for (string::size_type i=0; i<date_string.size(); ++i)
00046   if (date_string[i]=='/')
00047     date_string[i] = '-';
00048 /*
00049    2. Replace first ':' by space. In the example, this will yield 
00051         02-Nov-2000 10:20:36
00052 */
00053 date_string.replace(date_string.find_first_of(':'),1," ");
00055 /* Now parse using Date::Date(const string&). It will throw an exception
00056    if the date cannot be parsed. In that case, we catch the exception
00057    and return false.
00058 */
00059 try {
00060   Date  date(date_string);
00061   date_ = date;
00062   // We are only interested in hours, not minutes or seconds.
00063   date_.minutes(0);
00064   date_.seconds(0);
00065   }
00066 catch (exception& e) {
00067   cerr << "parse_date: " << e.what() << endl;
00068   return false;
00069   }
00070 return true;
00071 }
// Return the value (0..15) of the hexadecimal digit c.
// Precondition: isxdigit(c) holds; other characters give a meaningless result.
static inline int
hexdigit(char c) {
// The <ctype.h> classifiers are only defined for values representable as
// unsigned char (or EOF), so never hand them a plain, possibly negative, char.
const unsigned char uc = static_cast<unsigned char>(c);
if (isdigit(uc))
  return c - '0';
if (isupper(uc))
  return c - 'A' + 10;
return c - 'a' + 10; // must be a lower case letter
}

// In-place decoding of an encoded url, see RFC 1738.
// Every "%XY", where X and Y are hex digits (either case), is replaced by the
// single character with that code. A '%' that is not followed by two hex
// digits is copied unchanged, as is every other character. A '+' is
// deliberately NOT translated to ' ': apparently it is not used in the log file.
void
www_decode(std::string& s) {
const std::string::size_type len(s.size());
std::string::size_type j(0); // output index; never runs ahead of the input index
for (std::string::size_type i=0; i<len; ++i,++j) {
  // i+2<len guarantees that both characters after the '%' exist.
  if (s[i]=='%' && (i+2<len)
      && isxdigit(static_cast<unsigned char>(s[i+1]))
      && isxdigit(static_cast<unsigned char>(s[i+2]))) {
    s[j] = static_cast<char>(hexdigit(s[i+1])*16 + hexdigit(s[i+2]));
    i += 2; // skip the two digits just consumed
    }
  else
    s[j] = s[i];
  }
// Decoding can only shorten the string: drop the unused tail.
s.resize(j);
}
00109 // Retrieve and store path information from a line from a log file.
00110 bool
00111 LogRecord::parse_path(const string& line) {
00112 /* The path part can be found between the first and second occurrences
00113    of `"' (double quote). An example is shown below.
00115     "GET /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00117    This will result in 
00119      _path = <"","ssl","kiesoos.cgi">
00120 */
00121 string::size_type path_start(line.find_first_of('\"'));
00122 if (path_start==string::npos)
00123   return false;
00124 /* We are not interested in the verb (e.g. GET), so we skip until after the first ' '.
00125    In the example, path_start would then point to.
00127     /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00128 */
00129 path_start = line.find_first_of(' ',path_start);
00130 if (path_start==string::npos)
00131   return false;
00132 ++path_start; // After ' '.
00133 /*  We only want the path until the first '?', '#' or ' '. In the example
00134     this should result in.
00136     path_string = /ssl/kiesoos.cgi
00137 */
00139 string::size_type path_end = line.find_first_of("#? ",path_start);
00140 if (path_end==string::npos)
00141   return false;
00143 if (path_end==path_start) // cannot have an empty path
00144   return false;
00146 string path_string(line,path_start,path_end - path_start);
00148 // Decode to translate "%7E" back to '~' etc.
00149 www_decode(path_string);
00151 /* Decode the parts of the path_string into a vector. The first component of the
00152    vector will be empty if the path_string starts with '/'. 
00153    We also refuse to append empty strings to path_. This handles cases like
00154    "/a//b" which will result in <"","a","b">.
00156   We use n0 to indicate the start position of a component and
00157   n1 to indicate the position just after the end of the component.
00158 */
00159 string::size_type n0(path_string.find_first_not_of('/'));
00161 if (n0!=0) // the path starts with '/' which we encode as a first "/" component.
00162   path_.push_back("");
00164 if (n0==string::npos) // path_string must be "/"
00165   return true;
00167 string::size_type n1(string::npos);
00169 /* The variable n0 points to the start of a component. We make n1 point to
00170    the next '/', if any. The component then is the substring starting at
00171    n0 and ending before n1.
00172 */
00173 while ((n1=path_string.find_first_of('/',n0))!=string::npos) {
00174   if (n1>n0) // only non-empty parts are stored.
00175     path_.push_back(path_string.substr(n0,n1-n0));
00176   n0 = n1+1;
00177   }
00178 // If path_string does not end with '/', we must still add the final component.
00179 if (n0<path_string.size())
00180   path_.push_back(path_string.substr(n0));
00181 return true;
00182 }
00184 // Retrieve and store domain information from a line from a log file.
00185 bool
00186 LogRecord::parse_domain(const string& line) {
00187 /* The domain informatin is easy to find: it's the first part of
00188    a line, followed by a ' ', as illustrated in the following example.
00190 - - [12/Feb/2000:13:18:49 +0100] ...
00192 */
00193 string::size_type domain_start(0);
00194 string::size_type domain_end(line.find_first_of(' '));
00195 if (domain_end==string::npos)
00196   return false;
00197 string domain_string(line,domain_start,domain_end);
00199 string::size_type n0(domain_string.find_first_not_of('.'));
00200 if (n0!=0)
00201   return false;
00202 string::size_type n1(string::npos);
00204 while ((n1 = domain_string.find_first_of('.',n0)) != string::npos) {
00205   if (n1>n0)
00206     domain_.push_back(domain_string.substr(n0,n1-n0));
00207   n0 = n1+1;
00208   }
00209 // If domain does not end with '.', we still need to add a final component.
00210 if (n0<domain_string.size()) 
00211   domain_.push_back(domain_string.substr(n0));
00213 return true;
00214 }

httpstats-stage01 [ 7 April, 2001]