httpstats-final

00001 // $Id: logrecord.C,v 1.2 2001/04/12 14:55:57 dvermeir Exp $
00002 
00003 #include <string>
00004 #include <ctype.h> // isxdigit, isalnum
00005 #include "logrecord.h"
00006 #include "date.h"
00007 
00008 // Retrieve and store relevant information from a line from a log file.
00009 bool
00010 LogRecord::parse_line(const string& line) {
00011 /* A parse succeeds if all components can be parsed.
00012    This should be modified so that we only parse what is necessary,
00013    according to the configuration. But for now, we are too lazy.
00014 */
00015 return parse_date(line) && parse_path(line) && parse_domain(line);
00016 }
00017 
00018 // Retrieve and store date information from a line from a log file.
00019 bool
00020 LogRecord::parse_date(const string& line) {
00021 /* Select date part: just after first '[' up to first folloing ' '
00022    e.g.
00023 
00024         spider1.tiscalinet.be - - [02/Nov/2000:10:20:36 +0100] ..
00025 
00026    will select
00027 
00028         02/Nov/2000:10:20:36
00029 */
00030 string::size_type date_start(line.find_first_of('['));
00031 string::size_type date_end(line.find_first_of(' ',date_start));
00032 string            date_string(line,date_start+1,date_end - date_start-1);
00033 
00034 /* Make date_string acceptable to the Date::Date(const char*) parser.
00035 
00036    1. replace '/' by '-'. In the example, this will yield 
00037 
00038         02-Nov-2000:10:20:36
00039 */
00040 for (string::size_type i=0; i<date_string.size(); ++i)
00041   if (date_string[i]=='/')
00042     date_string[i] = '-';
00043 /*
00044    2. Replace first ':' by space. In the example, this will yield 
00045 
00046         02-Nov-2000 10:20:36
00047 */
00048 date_string.replace(date_string.find_first_of(':'),1," ");
00049 
00050 /* Now parse using Date::Date(const string&). It will throw an exception
00051    if the date cannot be parsed. In that case, we catch the exception
00052    and return false.
00053 */
00054 try {
00055   Date  d(date_string);
00056   date_.set(d.year(),d.month(),d.day(),d.hours());
00057   }
00058 catch (exception& e) {
00059   cerr << "LogRecord::parse_date failed: " << e.what() << endl;
00060   return false;
00061   }
00062 return true;
00063 }
00064 
00066 
// Return the value (0..15) of the hexadecimal digit c.
// Precondition: isxdigit(c) holds, i.e. c is one of 0-9, a-f, A-F.
static inline int
hexdigit(char c) {
// The <ctype.h> functions require a value representable as unsigned char
// (or EOF); passing a negative plain char is undefined behaviour.
const unsigned char uc = static_cast<unsigned char>(c);
if (isdigit(uc))
  return uc - '0';
else if (isupper(uc))
  return uc - 'A' + 10;
else // must be lower case letter
  return uc - 'a' + 10;
}

// In-place decoding of encoded url, see RFC 1738.
// Every "%XY" where X and Y are hex digits is replaced by the single
// character with value 0xXY; a '%' that is not followed by two hex digits
// is copied unchanged, as is every other character. The string is
// shortened to the decoded length.
void
www_decode(std::string& s) {
const std::string::size_type len = s.size();
std::string::size_type out = 0; // output index, never ahead of the input index
for (std::string::size_type in = 0; in < len; ++in, ++out) {
  // A '+' is not translated to ' ': apparently not used in the log file.
  const bool escaped =
    s[in] == '%' &&
    len - in > 2 && // two more characters must be available
    isxdigit(static_cast<unsigned char>(s[in+1])) &&
    isxdigit(static_cast<unsigned char>(s[in+2]));
  if (escaped) {
    s[out] = static_cast<char>(hexdigit(s[in+1]) * 16 + hexdigit(s[in+2]));
    in += 2; // skip the two hex digits just consumed
    }
  else
    s[out] = s[in];
  }
s.resize(out);
}
00100 
00101 // Retrieve and store path information from a line from a log file.
00102 bool
00103 LogRecord::parse_path(const string& line) {
00104 /* The path part can be found between the first and second occurrences
00105    of `"' (double quote). An example is shown below.
00106 
00107     "GET /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00108 
00109    This will result in 
00110 
00111      _path = <"","ssl","kiesoos.cgi">
00112 */
00113 string::size_type path_start(line.find_first_of('\"'));
00114 if (path_start==string::npos)
00115   return false;
00116 /* We are not interested in the verb (e.g. GET), so we skip until after the first ' '.
00117    In the example, path_start would then point to.
00118 
00119     /ssl/kiesoos.cgi?rolnr=58769\&stjcode=5L10021 HTTP/1.0" 
00120 */
00121 path_start = line.find_first_of(' ',path_start);
00122 if (path_start==string::npos)
00123   return false;
00124 ++path_start; // After ' '.
00125 /*  We only want the path until the first '?', '#' or ' '. In the example
00126     this should result in.
00127 
00128     path_string = /ssl/kiesoos.cgi
00129 */
00130 
00131 string::size_type path_end = line.find_first_of("#? ",path_start);
00132 if (path_end==string::npos)
00133   return false;
00134 
00135 if (path_end==path_start) // cannot have an empty path
00136   return false;
00137 
00138 string path_string(line,path_start,path_end - path_start);
00139 
00140 // Decode to translate "%7E" back to '~' etc.
00141 www_decode(path_string);
00142 
00143 return path_.parse(path_string);
00144 }
00145 
00146 // Retrieve and store domain information from a line from a log file.
00147 bool
00148 LogRecord::parse_domain(const string& line) {
00149 /* The domain informatin is easy to find: it's the first part of
00150    a line, followed by a ' ', as illustrated in the following example.
00151 
00152         igwe.vub.ac.be - - [12/Feb/2000:13:18:49 +0100] ...
00153         
00154 */
00155 string::size_type domain_start(0);
00156 string::size_type domain_end(line.find_first_of(' '));
00157 if (domain_end==string::npos)
00158   return false;
00159 string domain_string(line,domain_start,domain_end);
00160 
00161 return domain_.parse(domain_string);
00162 }