root/src/log.cc

Revision 5, 9.0 KB (checked in by nick, 3 years ago)
Line 
1#include "apachetop.h"
2
3#include "inlines.cc"
4
5#define RESOLVING_STRING "..."
6#define NO_RESOLVED_INFO "?"
7
8extern map *um, /* urlmap */
9           *im, /* ipmap */
10           *hm, /* hostmap */
11           *rm, /* referrermap */
12           *fm; /* filemap */
13
14extern time_t now;
15extern config cf;
16
17extern Circle *c;
18
19extern Queue want_host, want_ip;
20
21#if HAVE_ADNS_H
22extern adns_state adns;
23#endif
24
25/* CommonLogParser handles common and combined, despite its name */
26int CommonLogParser::parse(char *logline, struct logbits *b)
27{
28        char *bufsp, *bufcp, *ptr;
29        char *workptr;
30
31        struct sockaddr_in addr;
32
33        bufsp = logline;
34
35        /* host first */
36        bufcp = strchr(logline, ' ');
37        if (!bufcp)
38                return -1;
39       
40        *bufcp = (char) NULL;
41        ++bufcp;
42
43        /* quickly figure out if this is an IP or a host. We do this by
44         * checking each character of it; if every character is either a
45         * digit or a dot, then it's an IP (no host can just be digits)
46        */
47        for(workptr = bufsp ; *workptr ; workptr++)
48        {
49                if (isdigit(*workptr)) continue;
50                if (*workptr == '.') continue;
51
52                /* it's neither a digit or a dot */
53                break;
54        }
55
56        ptr = bufsp;
57        if (*workptr)
58        {
59                /* it is a hostname */
60
61                /* insert will return existing position if it exists */
62                b->host_pos = hm->insert(ptr);
63                b->host_hash = TTHash(ptr);
64                b->want_host = false; /* cos we have it */
65
66#if HAVE_ADNS_H
67                if (cf.do_resolving)
68                {
69                        b->want_ip = true;
70                       
71                        dprintf("lookup %s\n", ptr);
72                        /* fire off a query with adns */
73                        b->dns_query = new adns_query;
74                        adns_submit(adns, ptr, adns_r_a,
75                            (adns_queryflags) NULL, NULL, b->dns_query);
76
77                        b->ip_pos = im->insert(RESOLVING_STRING);
78                        b->ip_hash = TTHash(RESOLVING_STRING);
79                }
80                else
81#endif /* HAVE_ADNS_H */
82                {
83                        /* don't resolve the IP, and use -1 which means
84                         * "there is nothing of interest here" */
85                        b->ip_pos = -1;
86                        b->want_ip = false;
87                }
88        }
89        else
90        {
91                /* it is an IP */
92
93                b->ip_pos = im->insert(ptr);
94                b->ip_hash = TTHash(ptr);
95                b->want_ip = false; /* we have the IP already */
96
97#if HAVE_ADNS_H
98                if (cf.do_resolving)
99                {
100
101                        /* this is so we'll get a display like
102                           ..resolving.. [212.13.201.101]
103                           then once resolved:
104                           clueful.shagged.org [212.13.201.101]
105                        */
106                        b->host_pos = hm->insert(RESOLVING_STRING);
107                        b->host_hash = TTHash(RESOLVING_STRING);
108
109                        b->want_host = true; /* we're going to get this */
110
111                        /* construct network byte order num
112                        ** for adns_submit_reverse
113                        */
114                        addr.sin_family = AF_INET;
115                        addr.sin_addr.s_addr = inet_addr(ptr);
116
117                        b->dns_query = new adns_query;
118                        adns_submit_reverse(adns, (struct sockaddr *)&addr,
119                            adns_r_ptr, (adns_queryflags)adns_qf_owner,
120                            NULL, b->dns_query);
121                }
122                else
123#endif /* HAVE_ADNS_H */
124                {
125                        /* don't resolve the host, use the IP */
126                        b->host_pos = hm->insert(ptr);
127                        b->host_hash = TTHash(ptr);
128                        b->want_host = false; /* we are not resolving */
129                }
130        }
131
132        /* now skip to date */
133        if (!(bufcp = strchr(bufcp, '[')))
134                return -1;
135
136        bufcp++;
137
138        b->time = now; /* be lazy */
139
140        bufcp += 29; /* from dayofmonth to first char of method */
141
142        /* URL. processURL() will update bufcp to point at the end so we can
143         * continue processing from there */
144        if ((ptr = this->processURL(&bufcp)) == NULL)
145                return -1;
146
147        /* get url_pos for this url; for circle_struct (c) later */
148        b->url_pos = um->insert(ptr);
149        b->url_hash = TTHash(ptr);
150
151        /* return code */
152        b->retcode = atoi(bufcp);
153        bufcp += 4;
154
155        /* bytecount */
156        b->bytes = atoi(bufcp);
157
158
159        /* this may be the end of the line if it's a common log; if
160         * it's combined then we have referrer and user agent left */
161        if (!(bufsp = strchr(bufcp, '"')))
162        {
163                /* nothing left, its common */
164               
165                /* fill in a dummy value for referrer map */
166                b->ref_pos = rm->insert("Unknown");
167                return 0;
168        }
169
170        bufsp += 1; /* skip to first character of referrer */
171
172        /* find the end of referrer and null it */
173        if (!(bufcp = strchr(bufsp, '"')))
174                return -1;
175        *bufcp = (char) NULL;
176
177        /* unless they want to keep it, skip over the protocol, ie http:// */
178        if ((cf.preserve_ref_protocol == 0) && (bufcp = strstr(bufsp, "://")))
179                bufsp = bufcp + 3;
180       
181
182        /* we could munge the referrer now; cut down the path elements,
183         * remove querystring, but we'll leave that for a later date */
184
185//      b->referrer = bufsp;
186
187        /* get ref_pos for this url; for circle_struct (c) later */
188        b->ref_pos = rm->insert(bufsp);
189        b->ref_hash = TTHash(bufsp);
190
191        /* user-agent is as yet unused */
192
193        return 0;
194}
195
196
197int AtopLogParser::parse(char *logline, struct logbits *b)
198{
199        return 0;
200}
201
202
203/* generic parser helper functions */
204
205char *LogParser::processURL(char **buf) /* {{{ */
206{
207        char *bufcp, *realstart, *endptr;
208        int length;
209
210        bufcp = *buf;
211
212        /* this skips past the method */
213        if (!(bufcp = strchr(bufcp, ' ')) )
214                return NULL;
215        ++bufcp; // skip space
216
217        realstart = bufcp;
218
219        /* find the end of url; locate a protocol, out of the following list */
220        if (
221            !(endptr = strstr(bufcp, " HTTP/"))
222#if WITH_REAL_PROTOCOLS
223            /* v0.12: RealServer logs are very similar to Apache's,
224             * so we can support those too! Cool! */
225            && !(endptr = strstr(bufcp, " RTSP/")) /* RealStreaming UDP */
226            && !(endptr = strstr(bufcp, " RTSPT/")) /* RealStreaming TCP */
227            && !(endptr = strstr(bufcp, " RTSPH/")) /* RealStreaming HTTP */
228#endif
229           )
230                return NULL;
231
232        /* null the space in front of it */
233        *endptr = (char) NULL;
234
235        /* TODO maybe we can use the protocol someday.. */
236
237
238        /* this is all mungeURL is interested in */
239        length = endptr - realstart;
240
241        /* now find the finishing ", so parse* can deal with rest of line */
242        if (!(endptr = strstr(endptr+1, "\" ")))
243                return NULL;
244
245        mungeURL(&realstart, &length);
246       
247        /* feed back where the end of the URL is */
248        *buf = endptr+2;
249
250        return realstart;
251} /* }}} */
252
253/* munge the url passed in *url inplace;
254 * *length is the original length, and we update it once we're done */
255int LogParser::mungeURL(char **url, int *length) /* {{{ */
256{
257        int skipped = 0;
258        char *bufcp, *endptr, *workptr;
259
260        endptr = *url + *length;
261        *endptr = (char) NULL;
262
263        /* do we want to keep the query string? */
264        if (!cf.keep_querystring)
265        {
266                /* null the first ? or & - anything after
267                 * it is unrequired; it's the querystring */
268                if ((workptr = strchr(*url, '?')) ||
269                    (workptr = strchr(*url, '&')) )
270                {
271                        /* we might have overrun the end of the real URL and
272                         * gone into referrer or something. Check that. */
273                        if (workptr < endptr)
274                        {
275                                /* we're ok */
276                                *workptr = (char) NULL;
277                                bufcp = workptr+1;
278                        }
279                }
280        }
281
282        /* how many path segments of the url are we keeping? */
283        if (cf.keep_segments > 0)
284        {
285                /* given a path of /foo/bar/moo/ and a keep_segments of 2,
286                 * we want the / after the second element */
287
288                bufcp = workptr = *url + 1; /* skip leading / */
289
290                //dprintf("workptr is %s\n", workptr);
291
292                /* now skip the next keep_segments slashes */
293                while (skipped < cf.keep_segments && workptr < endptr)
294                {
295                        workptr++;
296
297                        if (*workptr == '/')
298                        {
299                                /* discovered a slash */
300                                skipped++;
301
302                                /* bufcp becomes the char after / */
303                                bufcp = workptr+1;
304                        }
305
306                        /* if we hit the end before finding the right number
307                         * of slashes, we just keep it all */
308                        if (workptr == endptr)
309                                bufcp = workptr;
310                }
311                *bufcp = (char) NULL;
312        }
313
314
315        /* do we want to lowercase it all? */
316        if (cf.lowercase_urls)
317        {
318                workptr = *url;
319                while(workptr < endptr)
320                {
321                        *workptr = tolower(*workptr);
322                        workptr++;
323                }
324        }
325
326        /* fin */
327
328        return 0;
329} /* }}} */
330
331#if HAVE_ADNS_H
332/* adns; check to see if any queries have returned, and populate the circle
333 * as required. Be careful of any circle entries that have expired since
334 * the query was started. */
335void collect_dns_responses()
336{
337        int err;
338        struct logbits *lb;
339        adns_answer *answer;
340        int got_host = false, got_ip = false;
341
342        /* check every circle entry that has want_host or want_ip */
343
344        while(c->walk(&lb) != -1)
345        {
346                if (lb->want_host == false && lb->want_ip == false)
347                        continue;
348
349//              dprintf("adns_check for %p\n", lb);
350                /* this circle slot has an outstanding query */
351                err = adns_check(adns, lb->dns_query, &answer, NULL);
352
353                if (err == EAGAIN)
354                {
355                        /* still waiting */
356                        continue;
357                }
358
359                /* some form of reply. Be it success or error, this query is
360                 * now done. */
361
362                got_host = lb->want_host;
363                got_ip = lb->want_ip;
364
365                lb->want_host = false;
366                lb->want_ip = false;
367                delete lb->dns_query;
368
369                if (answer->status == adns_s_ok)
370                {
371                        /* we have a reply */
372        //              dprintf("got a reply\n");
373                        if (got_host)
374                        {
375                                /* we'll have this new host in the hostmap ta */
376                                lb->host_pos = hm->insert(*answer->rrs.str);
377                                lb->host_hash = TTHash(*answer->rrs.str);
378                        }
379                        else if (got_ip)
380                        {
381                                /* put the IP into the ipmap */
382                                lb->ip_pos =
383                                    im->insert(inet_ntoa(*answer->rrs.inaddr));
384                                lb->ip_hash =
385                                    TTHash(inet_ntoa(*answer->rrs.inaddr));
386                        }
387
388                        free(answer);
389                        continue;
390                }
391
392                /* assume this IP has no reverse info; so we'll put the IP
393                 * into Host as well; this is so that the Host list will be
394                 * maintained properly (if we just put ? into Host, then
395                 * they bunch up together)
396                */
397
398                lb->host_pos = hm->insert(im->reverse(lb->ip_pos));
399                lb->host_hash = TTHash(im->reverse(lb->ip_pos));
400                free(answer);
401                continue;
402        }
403}
404#endif /* HAVE_ADNS_H */
Note: See TracBrowser for help on using the browser.