httpstats-final

00001 // $Id: logrecord.C,v 1.4 2001/04/12 14:55:55 dvermeir Exp $
00002 
00003 #include <string>
00004 #include <ctype.h> // isxdigit, isalnum
00005 #include "logrecord.h"
00006 #include "date.h"
00007 
00008 // Retrieve and store relevant information from a line from a log file.
00009 bool
00010 LogRecord::parse_line(const string& line) {
00011 /* Clear current vector<string> data members. This is
00012    important because parsing will append to path_ and domain_.
00013 */
00014 path_.clear(); 
00015 domain_.clear();
00016 /* A parse succeeds if all components can be parsed.
00017    This should be modified so that we only parse what is necessary,
00018    according to the configuration. But for now, we are too lazy.
00019 */
00020 return parse_date(line) && parse_path(line) && parse_domain(line);
00021 }
00022 
00023 // Retrieve and store date information from a line from a log file.
00024 bool
00025 LogRecord::parse_date(const string& line) {
00026 /* Select date part: just after first '[' up to first folloing ' '
00027    e.g.
00028 
00029         spider1.tiscalinet.be - - [02/Nov/2000:10:20:36 +0100] ..
00030 
00031    will select
00032 
00033         02/Nov/2000:10:20:36
00034 */
00035 string::size_type date_start(line.find_first_of('['));
00036 string::size_type date_end(line.find_first_of(' ',date_start));
00037 string            date_string(line,date_start+1,date_end - date_start-1);
00038 
00039 /* Make date_string acceptable to the Date::Date(const char*) parser.
00040 
00041    1. replace '/' by '-'. In the example, this will yield 
00042 
00043         02-Nov-2000:10:20:36
00044 */
00045 for (string::size_type i=0; i<date_string.size(); ++i)
00046   if (date_string[i]=='/')
00047     date_string[i] = '-';
00048 /*
00049    2. Replace first ':' by space. In the example, this will yield 
00050 
00051         02-Nov-2000 10:20:36
00052 */
00053 date_string.replace(date_string.find_first_of(':'),1," ");
00054 
00055 /* Now parse using Date::Date(const string&). It will throw an exception
00056    if the date cannot be parsed. In that case, we catch the exception
00057    and return false.
00058 */
00059 try {
00060   Date  date(date_string);
00061   date_ = date;
00062   // We are only interested in hours, not minutes or seconds.
00063   date_.minutes(0);
00064   date_.seconds(0);
00065   }
00066 catch (exception& e) {
00067   cerr << "parse_date: " << e.what() << endl;
00068   return false;
00069   }
00070 return true;
00071 }
00072 
00074 
// Return the numeric value (0..15) of the hexadecimal digit c.
// Precondition: c is a hex digit, i.e. isxdigit((unsigned char)c).
static inline int
hexdigit(char c) {
  // The <ctype.h> classifiers are only defined for values representable
  // as unsigned char (or EOF), so never hand them a plain, possibly
  // negative, char.
  const unsigned char uc = static_cast<unsigned char>(c);
  if (isdigit(uc))
    return uc - '0';
  if (isupper(uc))
    return uc - 'A' + 10;
  return uc - 'a' + 10; // must be a lower case letter
}

// In-place decoding of encoded url, see RFC 1738.
// Each "%XY" where X and Y are hex digits and at least one more character
// follows in s is replaced by the single character with code 0xXY. Every
// other character, including a '%' that does not start such a sequence,
// is copied unchanged.
void
www_decode(std::string& s) {
  typedef std::string::size_type size_type;
  const size_type len = s.size();
  size_type j = 0; // output index, never runs ahead of the input index i
  for (size_type i = 0; i < len; ++i, ++j) {
    // A '+' is not translated to ' ': apparently not used in the log file.
    if (s[i] == '%' && i + 2 < len
        && isxdigit(static_cast<unsigned char>(s[i+1]))
        && isxdigit(static_cast<unsigned char>(s[i+2]))) {
      s[j] = static_cast<char>(hexdigit(s[i+1]) * 16 + hexdigit(s[i+2]));
      i += 2; // skip the two hex digits just consumed
      }
    else
      s[j] = s[i];
    }
  s.resize(j);
}
00108 
00109 // Retrieve and store path information from a line from a log file.
00110 bool
00111 LogRecord::parse_path(const string& line) {
00112 /* The path part can be found between the first and second occurrences
00113    of `"' (double quote). An example is shown below.
00114 
00115     "GET /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00116 
00117    This will result in 
00118 
00119      _path = <"","ssl","kiesoos.cgi">
00120 */
00121 string::size_type path_start(line.find_first_of('\"'));
00122 if (path_start==string::npos)
00123   return false;
00124 /* We are not interested in the verb (e.g. GET), so we skip until after the first ' '.
00125    In the example, path_start would then point to.
00126 
00127     /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00128 */
00129 path_start = line.find_first_of(' ',path_start);
00130 if (path_start==string::npos)
00131   return false;
00132 ++path_start; // After ' '.
00133 /*  We only want the path until the first '?', '#' or ' '. In the example
00134     this should result in.
00135 
00136     path_string = /ssl/kiesoos.cgi
00137 */
00138 
00139 string::size_type path_end = line.find_first_of("#? ",path_start);
00140 if (path_end==string::npos)
00141   return false;
00142 
00143 if (path_end==path_start) // cannot have an empty path
00144   return false;
00145 
00146 string path_string(line,path_start,path_end - path_start);
00147 
00148 // Decode to translate "%7E" back to '~' etc.
00149 www_decode(path_string);
00150 
00151 /* Decode the parts of the path_string into a vector. The first component of the
00152    vector will be empty if the path_string starts with '/'. 
00153    We also refuse to append empty strings to path_. This handles cases like
00154    "/a//b" which will result in <"","a","b">.
00155 
00156   We use n0 to indicate the start position of a component and
00157   n1 to indicate the position just after the end of the component.
00158 */
00159 string::size_type n0(path_string.find_first_not_of('/'));
00160 
00161 if (n0!=0) // the path starts with '/' which we encode as a first "/" component.
00162   path_.push_back("");
00163 
00164 if (n0==string::npos) // path_string must be "/"
00165   return true;
00166 
00167 string::size_type n1(string::npos);
00168 
00169 /* The variable n0 points to the start of a component. We make n1 point to
00170    the next '/', if any. The component then is the substring starting at
00171    n0 and ending before n1.
00172 */
00173 while ((n1=path_string.find_first_of('/',n0))!=string::npos) {
00174   if (n1>n0) // only non-empty parts are stored.
00175     path_.push_back(path_string.substr(n0,n1-n0));
00176   n0 = n1+1;
00177   }
00178 // If path_string does not end with '/', we must still add the final component.
00179 if (n0<path_string.size())
00180   path_.push_back(path_string.substr(n0));
00181 return true;
00182 }
00183 
00184 // Retrieve and store domain information from a line from a log file.
00185 bool
00186 LogRecord::parse_domain(const string& line) {
00187 /* The domain informatin is easy to find: it's the first part of
00188    a line, followed by a ' ', as illustrated in the following example.
00189 
00190         igwe.vub.ac.be - - [12/Feb/2000:13:18:49 +0100] ...
00191         
00192 */
00193 string::size_type domain_start(0);
00194 string::size_type domain_end(line.find_first_of(' '));
00195 if (domain_end==string::npos)
00196   return false;
00197 string domain_string(line,domain_start,domain_end);
00198 
00199 string::size_type n0(domain_string.find_first_not_of('.'));
00200 if (n0!=0)
00201   return false;
00202 string::size_type n1(string::npos);
00203 
00204 while ((n1 = domain_string.find_first_of('.',n0)) != string::npos) {
00205   if (n1>n0)
00206     domain_.push_back(domain_string.substr(n0,n1-n0));
00207   n0 = n1+1;
00208   }
00209 // If domain does not end with '.', we still need to add a final component.
00210 if (n0<domain_string.size()) 
00211   domain_.push_back(domain_string.substr(n0));
00212   
00213 return true;
00214 }