1. 分析日志的 Python 框架：awk.py
- #
- # Custom awk.py module
- #
-
-
class controller:
    """Drive a set of line handlers over a line-oriented input, awk-style.

    Every subscribed handler is expected to provide five methods, which are
    the only ones this class calls: ``begin()``, ``process_line(line)``,
    ``end()``, ``description()`` and ``result()``.
    """

    def __init__(self, f):
        """Store the input stream and start with no handlers.

        f -- object with a ``readline()`` method that returns an empty
             string once the input is exhausted (e.g. ``sys.stdin``).
        """
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler ``o``; handlers are invoked in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call ``begin()`` on each handler, feed it every line, then ``end()``.

        Lines are passed through exactly as ``readline()`` returns them, so
        they normally still carry their trailing newline.
        """
        for o in self.m_handlers:
            o.begin()

        # Two-argument iter() keeps calling readline() until it returns "",
        # which is how the stream signals end of input.
        for line in iter(self.m_file.readline, ""):
            for o in self.m_handlers:
                o.process_line(line)

        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print each handler's description and result to standard output."""
        separator = "------------------------------------------------------"

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("")
        print("Results:")
        print("")

        for o in self.m_handlers:
            print(separator)
            print(o.description())
            print(separator)
            print(o.result())
统计日志的点击量：count_lines.py
- # Standard sys module
- import sys
-
- # Custom awk.py module
- import awk
-
class count_lines:
    """Handler that counts how many lines it has been given."""

    def __init__(self):
        # Defined up front so result() is usable even before begin() runs.
        self.m_count = 0

    def begin(self):
        """Reset the counter at the start of a run."""
        self.m_count = 0

    def process_line(self, s):
        """Count one more line; the content of ``s`` is not inspected."""
        self.m_count += 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines counted since the last ``begin()``."""
        return self.m_count
-
-
# Run only when executed as a script, so importing this module to reuse
# count_lines does not start reading standard input.
if __name__ == "__main__":
    # Step 1: create the awk controller, reading lines from standard input.
    ac = awk.controller(sys.stdin)

    # Step 2: subscribe the line-counting handler.
    ac.subscribe(count_lines())

    # Step 3: feed every input line to the subscribed handlers.
    ac.run()

    # Step 4: print each handler's description and result.
    ac.print_results()
使用方法：在 shell 中执行 `cat apachelog.log | python count_lines.py`
统计浏览次数超过 n 次的访问者：visitors.py（How many people have returned to the site more than N times?）
import re
import sys

# Custom awk.py module
import awk
-
class return_visitors:
    """Handler that counts IP addresses seen on more than ``n`` distinct days.

    For every line, the first whitespace-separated field is taken as the
    visitor's IP address and characters 1..6 of the fourth field as the day.
    NOTE(review): this assumes Apache-style access-log lines, where the
    fourth field looks like ``[10/Oct/2000:13:55:36`` and the slice yields
    ``10/Oct``. The year is not part of the key, so the same calendar day in
    two different years counts as one day -- confirm this is intended.
    """

    def __init__(self, n):
        """n -- threshold; an IP is counted when it has more than n days."""
        self.m_n = n
        # Maps IP address -> list of distinct day strings, in first-seen order.
        self.m_ip_days = {}
        # Defined up front so result() is usable even before end() runs.
        self.m_count = 0

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, s):
        """Record the day found in log line ``s`` for its IP address.

        Lines with fewer than four whitespace-separated fields are ignored.
        """
        fields = s.split()
        if len(fields) < 4:
            return

        ip = fields[0]
        day = fields[3][1:7]

        days = self.m_ip_days.setdefault(ip, [])
        if day not in days:
            days.append(day)

    def end(self):
        """Count the IP addresses whose number of distinct days exceeds n."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the heading shown above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by the last ``end()`` (0 before that)."""
        return self.m_count
# Run only when executed as a script, so importing this module to reuse
# return_visitors does not start reading standard input.
if __name__ == "__main__":
    ac = awk.controller(sys.stdin)
    # Report the IP addresses that were seen on more than 2 distinct days.
    ac.subscribe(return_visitors(2))
    ac.run()
    ac.print_results()
使用方法：在 shell 中执行 `cat apachelog.log | python visitors.py`
按照域名统计访问量：domain.py
import re
import sys

# Custom awk.py module
import awk
-
class referring_domains:
    """Handler that tallies log lines per referring domain.

    The referrer is read from the eleventh whitespace-separated field of
    each line. NOTE(review): that position matches the Apache "combined"
    log format -- confirm against the logs actually being analysed.
    """

    # "//", a host name, a literal dot, a 2-3 letter top-level domain, "/".
    # Hosts without a dot, with a longer TLD, or followed by a port number
    # are deliberately not matched.
    _DOMAIN_RE = re.compile(r"//[a-zA-Z0-9.-]*\.[a-zA-Z]{2,3}/")

    def __init__(self):
        # Maps domain name -> number of lines referred by it.
        self.m_domains = {}

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, line):
        """Add one hit for the referring domain found in ``line``, if any.

        Lines with fewer than eleven fields, or whose referrer field holds
        no recognisable domain, are ignored.
        """
        fields = line.split()
        if len(fields) <= 10:
            return

        m = self._DOMAIN_RE.search(fields[10])
        if m is None:
            return

        # Drop the leading "//" and the trailing "/" of the match.
        domain = m.group(0)[2:-1]
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """cmp-style comparison ordering domains by descending hit count.

        Returns -1 when ``key1`` has more hits than ``key2``, 0 when they
        are tied and 1 otherwise. Kept for callers that still use it;
        ``result()`` no longer needs it.
        """
        if self.m_domains[key1] > self.m_domains[key2]:
            return -1
        elif self.m_domains[key1] == self.m_domains[key2]:
            return 0
        else:
            return 1

    def result(self):
        """Return one "domain count" line per domain, busiest first.

        The text ends with one extra blank line.
        """
        # A key function replaces the cmp-style sort that Python 3 removed;
        # ties keep their dictionary order because the sort is stable.
        ranked = sorted(self.m_domains, key=self.m_domains.get, reverse=True)
        rows = ["%s %s\n" % (domain, self.m_domains[domain]) for domain in ranked]
        return "".join(rows) + "\n"
# Run only when executed as a script, so importing this module to reuse
# referring_domains does not start reading standard input.
if __name__ == "__main__":
    ac = awk.controller(sys.stdin)
    ac.subscribe(referring_domains())
    ac.run()
    ac.print_results()
# cat apachelog.log|python domain.py