#!/usr/bin/env python """peastat - simple live web stats http://www.throwingbeans.org/peastat/ instructions: 1. configure the 'logfile' and 'rooturl' values below 2. upload peastat.py somewhere it can be executed on your web server (e.g. your cgi-bin) 3. make peastat.py executable (set its permissions to 755) """ __version__ = "0.2" __author__ = "Tom Dyson (tomdyson at spamcop dot net)" __copyright__ = "(C) 2005 Tom Dyson. GNU GPL 2." __url__ = 'http://www.throwingbeans.org/peastat/' import cgitb cgitb.enable() import cgi, os, re, time, urllib try: import dbm # anydbm is unreliable... except: import dumbdbm as dbm # start configuring: logfile = "/Users/tomdyson/access_log" # full path to log file rooturl = "http://throwingbeans.org" # root url of site whose logs we're analysing # configure if you want to: minresults = 5 # minimum results to include in overview lastlines = 2000 # number of most recent requests to analyse ispage = re.compile('(/|\.html|\.htm|\.php|\.xml)$').search # requests matching this regex count as pages ignorelines = re.compile('pea\.py').search # ignore lines including this regex recentreferrers = 10 # show this many recent referrers recentsearches = 10 # show this many recent search terms database = "/tmp/peastat.db" # store DNS lookups here # stop configuring url = None; ip = None; atom = False cgiloc = os.environ.get('SCRIPT_NAME', '') request_uri = os.environ.get('REQUEST_URI', '') server_name = os.environ.get('SERVER_NAME', '') apachetoday = time.strftime('%d/%b/%Y') # todo ooh form = cgi.FieldStorage() if form.has_key( "url" ): url = form["url"].value if form.has_key( "ip" ): ip = form["ip"].value if form.has_key( "atom" ): atom = True def justdomain(url): # Return only the domain of a URL try: return url.split('//')[1].split('/')[0] except IndexError: # catch evil referrers return 'bad referrer' thisdomain = justdomain(rooturl) def sortByValue(d): """ Returns the keys of dictionary d sorted by their values """ items=d.items() backitems=[ [v[1],v[0]] for v in items] backitems.sort(); backitems.reverse() return [ backitems[i][1] for i in range(0,len(backitems))] def tailLines(filename,linesback): """python tail - modified from recipe at http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/157035 returns list of [linesback] lines from end of [filename]""" avgcharsperline=150 file = open(filename,'r') while 1: try: file.seek(-1 * avgcharsperline * linesback,2) except IOError: file.seek(0) if file.tell() == 0: atstart=1 else: atstart=0 lines=file.read().split("\n") if (len(lines) > (linesback+1)) or atstart: break #The lines are bigger than we thought avgcharsperline=avgcharsperline * 1.3 #Inc avg for retry file.close() if len(lines) > linesback: start=len(lines)-linesback -1 else: start=0 return lines[start:len(lines)-1] def timeSinceApacheDate(apacheDate): then = time.strptime(apacheDate,'%d/%b/%Y:%H:%M:%S') then = time.mktime(then) now = time.mktime(time.localtime()) minutesSince = (now-then) / 60 hours, minutes = divmod(minutesSince,60) return int(hours), int(minutes) def getDNS(ip): # get the domain name, if we've seen it before try: db = dbm.open(database, "c") if db.has_key(ip): addr = db[ip] else: addr = ip db.close() except: addr = ip return addr def getLogLines(logfile): try: logLines = tailLines(logfile,lastlines) except: # or try system's tail logLines = os.popen('/usr/bin/tail -n ' + str(lastlines) + ' ' + logfile).readlines() if len(logLines) == 0: # can't handle popen exceptions properly raise Exception ('No lines found') return logLines loglines = getLogLines(logfile) def getOverview(): t0 = time.time() overview = {'cgiloc':cgiloc} hits = {} pagecount = 0 overview["totalhits"] = len(loglines) referrers = [] queries = {} timeoffirsthit = loglines[0].split(' ')[3].replace('[','') for line in loglines: resource = line.split(' ')[6] if ispage(resource) and not ignorelines(line): pagecount = pagecount + 1 hits[resource] = hits.get(resource,0) + 1 lastres = resource line = line.replace('\\"','"') # some agents include escaped quotes referrer = line.split('"')[-4] if len(referrer) > 1 and referrer.find(thisdomain) == -1: # count queries querydict = cgi.parse_qs(referrer.split("?")[-1]) if referrer.count(".yahoo."): q = querydict.get("p") else: q = querydict.get("q") if q: q = q[0].lower() queries[q] = queries.get(q,0) + 1 referrers.append([referrer, q]) t1 = time.time() overview["timing"] = int((t1 - t0) * 1000) overview["logfile"] = logfile overview["timeoffirsthit"] = timeoffirsthit overview["hits"] = hits overview["lastrequest"] = lastres overview["pagecount"] = pagecount overview["referrers"] = referrers overview["queries"] = queries hourssince, minutessince = timeSinceApacheDate(timeoffirsthit) pagehitsperhour = pagecount / (hourssince + ( float(minutessince) / 60 )) overview["hourssince"], overview["minutessince"] = hourssince, minutessince overview["pagehitsperhour"] = int(round(pagehitsperhour)) return overview def displayOverviewHTML(overview): print """
Summary
First hit counted %(hourssince)s hours, %(minutessince)s minutes ago
Total hits: %(totalhits)s
Page hits: %(pagecount)s (%(pagehitsperhour)s per hour)
Last page request: %(lastrequest)s details
Processing time: %(timing)s ms
Recent popular pages (%s or more requests)
""" % minresults
hits = overview["hits"]
for res in sortByValue(hits):
score = hits[res]
if score >= minresults:
print """%s:
%s
""" % (res, overview["cgiloc"], urllib.quote(res), score)
print """
%s recent referrers
""" % recentreferrers
referrers = overview["referrers"]
referrers.reverse()
for referrer, query in referrers[0:recentreferrers]:
referrer = referrer.replace("&","&")
print """%s""" % (referrer, referrer, justdomain(referrer))
if query: print " - %s" % query
print "
"
print "
%s recent popular search terms
""" % recentsearches
queries = overview["queries"]
for query in sortByValue(queries)[0:recentsearches]:
query_score = queries[query]
quoted_query = query.replace('"','%22')
print """%(query)s:
%(query_score)s
""" % vars()
print "
Requests for %s
""" % url
counter = 1
for line in loglines:
resource = line.split(' ')[6]
if resource == url and not ignorelines(line):
time = line.split(' ')[3].replace('[','')
if time.startswith(apachetoday): time = time.replace(apachetoday +':','today, ')
ip = line.split(' ')[0]
addr = getDNS(ip)
print """%(counter)s: %(time)s: %(addr)s
""" % vars()
counter = counter + 1
print "
Visit details for %s
hostname: %s
""" % (ip, addr)
counter = 1; pagecounter = 1
for line in loglines:
address = line.split(' ')[0]
if address == ip:
time = line.split(' ')[3].replace('[','')
if time.startswith(apachetoday): time = time.replace(apachetoday +':','today, ')
resource = line.split(' ')[6]
if counter == 1:
referrer = line.split('"')[-4]
user_agent = line.split('"')[-2]
if len(user_agent) > 50: user_agent = user_agent[0:50].strip() + "..."
if len(referrer) > 1:
print """referrer: %(referrer)s
""" % vars()
print """browser: %(user_agent)s
""" % vars()
if ispage(resource):
quotedresource = urllib.quote(resource)
print """%(pagecounter)s: %(time)s: %(resource)s [details]
""" % vars()
pagecounter += 1
counter += 1
print "
peastat %s © tom dyson 2005 // updates, bugs, suggestions