| 1 | #include "apachetop.h" |
|---|
| 2 | |
|---|
| 3 | #include "inlines.cc" |
|---|
| 4 | |
|---|
| 5 | #define RESOLVING_STRING "..." |
|---|
| 6 | #define NO_RESOLVED_INFO "?" |
|---|
| 7 | |
|---|
| 8 | extern map *um, /* urlmap */ |
|---|
| 9 | *im, /* ipmap */ |
|---|
| 10 | *hm, /* hostmap */ |
|---|
| 11 | *rm, /* referrermap */ |
|---|
| 12 | *fm; /* filemap */ |
|---|
| 13 | |
|---|
| 14 | extern time_t now; |
|---|
| 15 | extern config cf; |
|---|
| 16 | |
|---|
| 17 | extern Circle *c; |
|---|
| 18 | |
|---|
| 19 | extern Queue want_host, want_ip; |
|---|
| 20 | |
|---|
| 21 | #if HAVE_ADNS_H |
|---|
| 22 | extern adns_state adns; |
|---|
| 23 | #endif |
|---|
| 24 | |
|---|
| 25 | /* CommonLogParser handles common and combined, despite its name */ |
|---|
| 26 | int CommonLogParser::parse(char *logline, struct logbits *b) |
|---|
| 27 | { |
|---|
| 28 | char *bufsp, *bufcp, *ptr; |
|---|
| 29 | char *workptr; |
|---|
| 30 | |
|---|
| 31 | struct sockaddr_in addr; |
|---|
| 32 | |
|---|
| 33 | bufsp = logline; |
|---|
| 34 | |
|---|
| 35 | /* host first */ |
|---|
| 36 | bufcp = strchr(logline, ' '); |
|---|
| 37 | if (!bufcp) |
|---|
| 38 | return -1; |
|---|
| 39 | |
|---|
| 40 | *bufcp = (char) NULL; |
|---|
| 41 | ++bufcp; |
|---|
| 42 | |
|---|
| 43 | /* quickly figure out if this is an IP or a host. We do this by |
|---|
| 44 | * checking each character of it; if every character is either a |
|---|
| 45 | * digit or a dot, then it's an IP (no host can just be digits) |
|---|
| 46 | */ |
|---|
| 47 | for(workptr = bufsp ; *workptr ; workptr++) |
|---|
| 48 | { |
|---|
| 49 | if (isdigit(*workptr)) continue; |
|---|
| 50 | if (*workptr == '.') continue; |
|---|
| 51 | |
|---|
| 52 | /* it's neither a digit or a dot */ |
|---|
| 53 | break; |
|---|
| 54 | } |
|---|
| 55 | |
|---|
| 56 | ptr = bufsp; |
|---|
| 57 | if (*workptr) |
|---|
| 58 | { |
|---|
| 59 | /* it is a hostname */ |
|---|
| 60 | |
|---|
| 61 | /* insert will return existing position if it exists */ |
|---|
| 62 | b->host_pos = hm->insert(ptr); |
|---|
| 63 | b->host_hash = TTHash(ptr); |
|---|
| 64 | b->want_host = false; /* cos we have it */ |
|---|
| 65 | |
|---|
| 66 | #if HAVE_ADNS_H |
|---|
| 67 | if (cf.do_resolving) |
|---|
| 68 | { |
|---|
| 69 | b->want_ip = true; |
|---|
| 70 | |
|---|
| 71 | dprintf("lookup %s\n", ptr); |
|---|
| 72 | /* fire off a query with adns */ |
|---|
| 73 | b->dns_query = new adns_query; |
|---|
| 74 | adns_submit(adns, ptr, adns_r_a, |
|---|
| 75 | (adns_queryflags) NULL, NULL, b->dns_query); |
|---|
| 76 | |
|---|
| 77 | b->ip_pos = im->insert(RESOLVING_STRING); |
|---|
| 78 | b->ip_hash = TTHash(RESOLVING_STRING); |
|---|
| 79 | } |
|---|
| 80 | else |
|---|
| 81 | #endif /* HAVE_ADNS_H */ |
|---|
| 82 | { |
|---|
| 83 | /* don't resolve the IP, and use -1 which means |
|---|
| 84 | * "there is nothing of interest here" */ |
|---|
| 85 | b->ip_pos = -1; |
|---|
| 86 | b->want_ip = false; |
|---|
| 87 | } |
|---|
| 88 | } |
|---|
| 89 | else |
|---|
| 90 | { |
|---|
| 91 | /* it is an IP */ |
|---|
| 92 | |
|---|
| 93 | b->ip_pos = im->insert(ptr); |
|---|
| 94 | b->ip_hash = TTHash(ptr); |
|---|
| 95 | b->want_ip = false; /* we have the IP already */ |
|---|
| 96 | |
|---|
| 97 | #if HAVE_ADNS_H |
|---|
| 98 | if (cf.do_resolving) |
|---|
| 99 | { |
|---|
| 100 | |
|---|
| 101 | /* this is so we'll get a display like |
|---|
| 102 | ..resolving.. [212.13.201.101] |
|---|
| 103 | then once resolved: |
|---|
| 104 | clueful.shagged.org [212.13.201.101] |
|---|
| 105 | */ |
|---|
| 106 | b->host_pos = hm->insert(RESOLVING_STRING); |
|---|
| 107 | b->host_hash = TTHash(RESOLVING_STRING); |
|---|
| 108 | |
|---|
| 109 | b->want_host = true; /* we're going to get this */ |
|---|
| 110 | |
|---|
| 111 | /* construct network byte order num |
|---|
| 112 | ** for adns_submit_reverse |
|---|
| 113 | */ |
|---|
| 114 | addr.sin_family = AF_INET; |
|---|
| 115 | addr.sin_addr.s_addr = inet_addr(ptr); |
|---|
| 116 | |
|---|
| 117 | b->dns_query = new adns_query; |
|---|
| 118 | adns_submit_reverse(adns, (struct sockaddr *)&addr, |
|---|
| 119 | adns_r_ptr, (adns_queryflags)adns_qf_owner, |
|---|
| 120 | NULL, b->dns_query); |
|---|
| 121 | } |
|---|
| 122 | else |
|---|
| 123 | #endif /* HAVE_ADNS_H */ |
|---|
| 124 | { |
|---|
| 125 | /* don't resolve the host, use the IP */ |
|---|
| 126 | b->host_pos = hm->insert(ptr); |
|---|
| 127 | b->host_hash = TTHash(ptr); |
|---|
| 128 | b->want_host = false; /* we are not resolving */ |
|---|
| 129 | } |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | /* now skip to date */ |
|---|
| 133 | if (!(bufcp = strchr(bufcp, '['))) |
|---|
| 134 | return -1; |
|---|
| 135 | |
|---|
| 136 | bufcp++; |
|---|
| 137 | |
|---|
| 138 | b->time = now; /* be lazy */ |
|---|
| 139 | |
|---|
| 140 | bufcp += 29; /* from dayofmonth to first char of method */ |
|---|
| 141 | |
|---|
| 142 | /* URL. processURL() will update bufcp to point at the end so we can |
|---|
| 143 | * continue processing from there */ |
|---|
| 144 | if ((ptr = this->processURL(&bufcp)) == NULL) |
|---|
| 145 | return -1; |
|---|
| 146 | |
|---|
| 147 | /* get url_pos for this url; for circle_struct (c) later */ |
|---|
| 148 | b->url_pos = um->insert(ptr); |
|---|
| 149 | b->url_hash = TTHash(ptr); |
|---|
| 150 | |
|---|
| 151 | /* return code */ |
|---|
| 152 | b->retcode = atoi(bufcp); |
|---|
| 153 | bufcp += 4; |
|---|
| 154 | |
|---|
| 155 | /* bytecount */ |
|---|
| 156 | b->bytes = atoi(bufcp); |
|---|
| 157 | |
|---|
| 158 | |
|---|
| 159 | /* this may be the end of the line if it's a common log; if |
|---|
| 160 | * it's combined then we have referrer and user agent left */ |
|---|
| 161 | if (!(bufsp = strchr(bufcp, '"'))) |
|---|
| 162 | { |
|---|
| 163 | /* nothing left, its common */ |
|---|
| 164 | |
|---|
| 165 | /* fill in a dummy value for referrer map */ |
|---|
| 166 | b->ref_pos = rm->insert("Unknown"); |
|---|
| 167 | return 0; |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | bufsp += 1; /* skip to first character of referrer */ |
|---|
| 171 | |
|---|
| 172 | /* find the end of referrer and null it */ |
|---|
| 173 | if (!(bufcp = strchr(bufsp, '"'))) |
|---|
| 174 | return -1; |
|---|
| 175 | *bufcp = (char) NULL; |
|---|
| 176 | |
|---|
| 177 | /* unless they want to keep it, skip over the protocol, ie http:// */ |
|---|
| 178 | if ((cf.preserve_ref_protocol == 0) && (bufcp = strstr(bufsp, "://"))) |
|---|
| 179 | bufsp = bufcp + 3; |
|---|
| 180 | |
|---|
| 181 | |
|---|
| 182 | /* we could munge the referrer now; cut down the path elements, |
|---|
| 183 | * remove querystring, but we'll leave that for a later date */ |
|---|
| 184 | |
|---|
| 185 | // b->referrer = bufsp; |
|---|
| 186 | |
|---|
| 187 | /* get ref_pos for this url; for circle_struct (c) later */ |
|---|
| 188 | b->ref_pos = rm->insert(bufsp); |
|---|
| 189 | b->ref_hash = TTHash(bufsp); |
|---|
| 190 | |
|---|
| 191 | /* user-agent is as yet unused */ |
|---|
| 192 | |
|---|
| 193 | return 0; |
|---|
| 194 | } |
|---|
| 195 | |
|---|
| 196 | |
|---|
| 197 | int AtopLogParser::parse(char *logline, struct logbits *b) |
|---|
| 198 | { |
|---|
| 199 | return 0; |
|---|
| 200 | } |
|---|
| 201 | |
|---|
| 202 | |
|---|
| 203 | /* generic parser helper functions */ |
|---|
| 204 | |
|---|
| 205 | char *LogParser::processURL(char **buf) /* {{{ */ |
|---|
| 206 | { |
|---|
| 207 | char *bufcp, *realstart, *endptr; |
|---|
| 208 | int length; |
|---|
| 209 | |
|---|
| 210 | bufcp = *buf; |
|---|
| 211 | |
|---|
| 212 | /* this skips past the method */ |
|---|
| 213 | if (!(bufcp = strchr(bufcp, ' ')) ) |
|---|
| 214 | return NULL; |
|---|
| 215 | ++bufcp; // skip space |
|---|
| 216 | |
|---|
| 217 | realstart = bufcp; |
|---|
| 218 | |
|---|
| 219 | /* find the end of url; locate a protocol, out of the following list */ |
|---|
| 220 | if ( |
|---|
| 221 | !(endptr = strstr(bufcp, " HTTP/")) |
|---|
| 222 | #if WITH_REAL_PROTOCOLS |
|---|
| 223 | /* v0.12: RealServer logs are very similar to Apache's, |
|---|
| 224 | * so we can support those too! Cool! */ |
|---|
| 225 | && !(endptr = strstr(bufcp, " RTSP/")) /* RealStreaming UDP */ |
|---|
| 226 | && !(endptr = strstr(bufcp, " RTSPT/")) /* RealStreaming TCP */ |
|---|
| 227 | && !(endptr = strstr(bufcp, " RTSPH/")) /* RealStreaming HTTP */ |
|---|
| 228 | #endif |
|---|
| 229 | ) |
|---|
| 230 | return NULL; |
|---|
| 231 | |
|---|
| 232 | /* null the space in front of it */ |
|---|
| 233 | *endptr = (char) NULL; |
|---|
| 234 | |
|---|
| 235 | /* TODO maybe we can use the protocol someday.. */ |
|---|
| 236 | |
|---|
| 237 | |
|---|
| 238 | /* this is all mungeURL is interested in */ |
|---|
| 239 | length = endptr - realstart; |
|---|
| 240 | |
|---|
| 241 | /* now find the finishing ", so parse* can deal with rest of line */ |
|---|
| 242 | if (!(endptr = strstr(endptr+1, "\" "))) |
|---|
| 243 | return NULL; |
|---|
| 244 | |
|---|
| 245 | mungeURL(&realstart, &length); |
|---|
| 246 | |
|---|
| 247 | /* feed back where the end of the URL is */ |
|---|
| 248 | *buf = endptr+2; |
|---|
| 249 | |
|---|
| 250 | return realstart; |
|---|
| 251 | } /* }}} */ |
|---|
| 252 | |
|---|
| 253 | /* munge the url passed in *url inplace; |
|---|
| 254 | * *length is the original length, and we update it once we're done */ |
|---|
| 255 | int LogParser::mungeURL(char **url, int *length) /* {{{ */ |
|---|
| 256 | { |
|---|
| 257 | int skipped = 0; |
|---|
| 258 | char *bufcp, *endptr, *workptr; |
|---|
| 259 | |
|---|
| 260 | endptr = *url + *length; |
|---|
| 261 | *endptr = (char) NULL; |
|---|
| 262 | |
|---|
| 263 | /* do we want to keep the query string? */ |
|---|
| 264 | if (!cf.keep_querystring) |
|---|
| 265 | { |
|---|
| 266 | /* null the first ? or & - anything after |
|---|
| 267 | * it is unrequired; it's the querystring */ |
|---|
| 268 | if ((workptr = strchr(*url, '?')) || |
|---|
| 269 | (workptr = strchr(*url, '&')) ) |
|---|
| 270 | { |
|---|
| 271 | /* we might have overrun the end of the real URL and |
|---|
| 272 | * gone into referrer or something. Check that. */ |
|---|
| 273 | if (workptr < endptr) |
|---|
| 274 | { |
|---|
| 275 | /* we're ok */ |
|---|
| 276 | *workptr = (char) NULL; |
|---|
| 277 | bufcp = workptr+1; |
|---|
| 278 | } |
|---|
| 279 | } |
|---|
| 280 | } |
|---|
| 281 | |
|---|
| 282 | /* how many path segments of the url are we keeping? */ |
|---|
| 283 | if (cf.keep_segments > 0) |
|---|
| 284 | { |
|---|
| 285 | /* given a path of /foo/bar/moo/ and a keep_segments of 2, |
|---|
| 286 | * we want the / after the second element */ |
|---|
| 287 | |
|---|
| 288 | bufcp = workptr = *url + 1; /* skip leading / */ |
|---|
| 289 | |
|---|
| 290 | //dprintf("workptr is %s\n", workptr); |
|---|
| 291 | |
|---|
| 292 | /* now skip the next keep_segments slashes */ |
|---|
| 293 | while (skipped < cf.keep_segments && workptr < endptr) |
|---|
| 294 | { |
|---|
| 295 | workptr++; |
|---|
| 296 | |
|---|
| 297 | if (*workptr == '/') |
|---|
| 298 | { |
|---|
| 299 | /* discovered a slash */ |
|---|
| 300 | skipped++; |
|---|
| 301 | |
|---|
| 302 | /* bufcp becomes the char after / */ |
|---|
| 303 | bufcp = workptr+1; |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | /* if we hit the end before finding the right number |
|---|
| 307 | * of slashes, we just keep it all */ |
|---|
| 308 | if (workptr == endptr) |
|---|
| 309 | bufcp = workptr; |
|---|
| 310 | } |
|---|
| 311 | *bufcp = (char) NULL; |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | |
|---|
| 315 | /* do we want to lowercase it all? */ |
|---|
| 316 | if (cf.lowercase_urls) |
|---|
| 317 | { |
|---|
| 318 | workptr = *url; |
|---|
| 319 | while(workptr < endptr) |
|---|
| 320 | { |
|---|
| 321 | *workptr = tolower(*workptr); |
|---|
| 322 | workptr++; |
|---|
| 323 | } |
|---|
| 324 | } |
|---|
| 325 | |
|---|
| 326 | /* fin */ |
|---|
| 327 | |
|---|
| 328 | return 0; |
|---|
| 329 | } /* }}} */ |
|---|
| 330 | |
|---|
| 331 | #if HAVE_ADNS_H |
|---|
| 332 | /* adns; check to see if any queries have returned, and populate the circle |
|---|
| 333 | * as required. Be careful of any circle entries that have expired since |
|---|
| 334 | * the query was started. */ |
|---|
| 335 | void collect_dns_responses() |
|---|
| 336 | { |
|---|
| 337 | int err; |
|---|
| 338 | struct logbits *lb; |
|---|
| 339 | adns_answer *answer; |
|---|
| 340 | int got_host = false, got_ip = false; |
|---|
| 341 | |
|---|
| 342 | /* check every circle entry that has want_host or want_ip */ |
|---|
| 343 | |
|---|
| 344 | while(c->walk(&lb) != -1) |
|---|
| 345 | { |
|---|
| 346 | if (lb->want_host == false && lb->want_ip == false) |
|---|
| 347 | continue; |
|---|
| 348 | |
|---|
| 349 | // dprintf("adns_check for %p\n", lb); |
|---|
| 350 | /* this circle slot has an outstanding query */ |
|---|
| 351 | err = adns_check(adns, lb->dns_query, &answer, NULL); |
|---|
| 352 | |
|---|
| 353 | if (err == EAGAIN) |
|---|
| 354 | { |
|---|
| 355 | /* still waiting */ |
|---|
| 356 | continue; |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | /* some form of reply. Be it success or error, this query is |
|---|
| 360 | * now done. */ |
|---|
| 361 | |
|---|
| 362 | got_host = lb->want_host; |
|---|
| 363 | got_ip = lb->want_ip; |
|---|
| 364 | |
|---|
| 365 | lb->want_host = false; |
|---|
| 366 | lb->want_ip = false; |
|---|
| 367 | delete lb->dns_query; |
|---|
| 368 | |
|---|
| 369 | if (answer->status == adns_s_ok) |
|---|
| 370 | { |
|---|
| 371 | /* we have a reply */ |
|---|
| 372 | // dprintf("got a reply\n"); |
|---|
| 373 | if (got_host) |
|---|
| 374 | { |
|---|
| 375 | /* we'll have this new host in the hostmap ta */ |
|---|
| 376 | lb->host_pos = hm->insert(*answer->rrs.str); |
|---|
| 377 | lb->host_hash = TTHash(*answer->rrs.str); |
|---|
| 378 | } |
|---|
| 379 | else if (got_ip) |
|---|
| 380 | { |
|---|
| 381 | /* put the IP into the ipmap */ |
|---|
| 382 | lb->ip_pos = |
|---|
| 383 | im->insert(inet_ntoa(*answer->rrs.inaddr)); |
|---|
| 384 | lb->ip_hash = |
|---|
| 385 | TTHash(inet_ntoa(*answer->rrs.inaddr)); |
|---|
| 386 | } |
|---|
| 387 | |
|---|
| 388 | free(answer); |
|---|
| 389 | continue; |
|---|
| 390 | } |
|---|
| 391 | |
|---|
| 392 | /* assume this IP has no reverse info; so we'll put the IP |
|---|
| 393 | * into Host as well; this is so that the Host list will be |
|---|
| 394 | * maintained properly (if we just put ? into Host, then |
|---|
| 395 | * they bunch up together) |
|---|
| 396 | */ |
|---|
| 397 | |
|---|
| 398 | lb->host_pos = hm->insert(im->reverse(lb->ip_pos)); |
|---|
| 399 | lb->host_hash = TTHash(im->reverse(lb->ip_pos)); |
|---|
| 400 | free(answer); |
|---|
| 401 | continue; |
|---|
| 402 | } |
|---|
| 403 | } |
|---|
| 404 | #endif /* HAVE_ADNS_H */ |
|---|