http://paperlined.org/dev/src/pl/apache/parse_logs/ParseLog.pm
package ParseLog;
# TODO:
# - debug the binary search code below
use strict;
use warnings;
use lib '/home/interiot/cpan/lib';
use Date::Parse;
use CGI::Util;
use Search::Binary;
use List::Util qw[min];
################################################### parseCombinedLine ###########################################################
# thanks to http://www.fourmilab.ch/fourmilog/archives/2005-03/000500.html
my $pat_ip_address = qr/(\d{1,3} \.
\d{1,3} \.
\d{1,3} \.
\d{1,3})/x;
my $pat_quoted_field = qr/"((?:(?:(?:(?: # It can be...
[^"\\])* | # ...zero or more characters not quote or backslash...
(?:\\x[0-9a-fA-F][0-9a-fA-F])* | # ...a backslash quoted hexadecimal character...
(?:\\.*) # ...or a backslash escape.
))*))"/x;
my $parse_combined = qr/^ # Start at the beginning
$pat_ip_address \s+ # IP address
(\S+) \s+ # Ident
(\S+) \s+ # Userid
\[([^\]]*)\] \s+ # Date and time
$pat_quoted_field \s+ # Request
(\d+) \s+ # Status
(\-|[\d]+) \s+ # Length of reply or "-"
$pat_quoted_field \s+ # Referer
$pat_quoted_field # User agent
$ # End at the end
/x;
my @combined_fields = qw[
ipaddr
ident
userid
textdate
request
status
replylen
referer
useragent
];
# reads and parses a combined log file line; returns undef on EOF
sub parseCombined {
my $fh = shift;
defined(my $str = <$fh>) or return;
return parseCombinedLine($str, @_);
}
# parse a specific line that's been read in
sub parseCombinedLine {
(my $str = shift) =~ s/\s+$//si;
my $hostname = shift;
my %ret;
@ret{@combined_fields} = ($str =~ $parse_combined);
foreach my $f (qw[ request referer useragent ])
{$ret{$f} =~ s/\\(.)/$1/sg}
@ret{qw[method url httpver]} = split(' ', delete $ret{request});
$ret{date} = Date::Parse::str2time($ret{textdate});
$ret{line} = $str;
$ret{url} = "http://$hostname" . $ret{url};
return \%ret;
}
################################################### locateDateFromEnd ###########################################################
# Given a specific time (ether a normal perl time, or the standard cookie expiration format, eg. "3m" is three minuets ago, "2y" is two years ago),
# this finds the spot in the log file where the first record on or after that time is.
#
# Intended to be used to find lines relative to the end of the file... if looking for lines near the beginning of a file, something based on Tie::File might be
# more useful, since it remembers the starting position of all lines up to the max it's read so far.
sub locateEndBLOCKSIZE {64*1024}
sub locateDateFromEnd {
my $fh = shift;
my $time_sought = shift;
my $lineParser = shift; # any function that... inputs: one string, comprising one log file line... outputs: a hash-ref of fields, one of which should be "date"
if ($time_sought !~ /^\d+$/) {
$time_sought = CGI::Util::expire_calc("-" . $time_sought);
}
my $block_cache = {};
my $pos = binary_search(0, (-s $fh)-1, $time_sought, \&locateEndRead,
{block_cache => {},
line_parser => $lineParser,
fh => $fh,
});
seek($fh, $pos, 0);
}
sub locateEndRead {
my ($handle, $time_sought, $pos) = @_;
my ($block_cache, $lineParser, $fh) = @$handle->{qw[block_cache line_parser fh]};
# remove all blocks that aren't within 2*BLOCKSIZE of the current position
foreach my $p (keys %$block_cache) {
if (abs($p - $pos) > 2*locateEndBLOCKSIZE()) {
delete $block_cache->{$p};
}
}
# construct the line we needed
my $blockpos_before = ($pos-1) % locateEndBLOCKSIZE(); # -1 because we want to see the character right before this string, to see if it's a newline or not
my $blockpos_after = $blockpos_before + locateEndBLOCKSIZE();
$blockpos_before = 0 if ($blockpos_before < 0); # the very first string in the file doesn't have to / can't have a newline before it
my $blockstr_before = ($block_cache->{$blockpos_before} ||= readblock($fh, $blockpos_before, locateEndBLOCKSIZE()));
my $first_half = substr($blockstr_before, $pos-$blockpos_before);
# If the position that Search::Binary gave us isn't the start of a whole record, then we need to move to the start of the next full record
if ($pos > 0) {
my $charbefore = substr($blockstr_before, $pos-$blockpos_before-1, 1);
if (!($charbefore =~ /^\x0A$/s || ($charbefore =~ /^\x0D$/s && $first_half !~ /^\x0A/s))) {
$blockstr_before =~ s/^(.*?(?:\x0D\x0A|\x0D|\x0A))//s;
$pos += length($1); # we need to tell Search::Binary where the whole record actually started
}
}
# If the block we're reading doesn't contain an entire record, we need to look at the next block as well
if ($first_half !~ s/[\n\r].*//si) {
my $blockstr_after = ($block_cache->{$blockpos_after} ||= readblock($fh, $blockpos_after, locateEndBLOCKSIZE()));
$first_half .= $blockstr_after || '';
if ($first_half !~ s/[\n\r].*//si) {
# TODO: unfortunately, if this is the last line in the file, it will fall into this branch... add a check in case we're at the very end
die "Unable to find end of string."; # if lines are too long, we're not going to deal with that here... just make the block size bigger than any line
}
}
my $fields = $lineParser->($first_half);
return ($fields->{date} <=> $time_sought, $pos);
}
sub readblock {
my $fh = shift;
my $offset = shift;
my $len = shift;
seek($fh, $offset, 0);
my $in;
read($fh, $in, $len);
return $in;
}
################################################### Classification ##############################################################
sub is_search_spider {
my $rec = shift;
return 1 if ($rec->{useragent} =~ m{
\bcrawler\@dotnetdotcom\.org|
\bYoudaoBot|
\bYahoo!.Slurp|
\bgoogle.com/bot.html|
\bbaidu.com/search/spider.htm|
\bbaidu.com/search/|
\bask.com/en/docs/about|
\bsearch.msn.com/msnbot.htm|
\bsitesell.com/sbider.html|
\bhelp.naver.com/delete_main.asp|
\bVoilaBot|
\bresearch.microsoft.com/research/sv/msrbot/|
\bglobalspec.com/Ocelli|
\bsearchme.com/support/|
\bbecome.com/site_owners.html|
\byodao.com/help/webmaster/spider/|
\bexabot.com/go/robot|
\bvoyager-hc| # http://www.kosmix.com/html/crawler.html
\bdiscoveryengine.com/discobot.html|
\bbing.com/bingbot.htm|
\byandex.com/bots|
\bscoutjet.com
}ix);
}
# Wow, there are some serious referer spammers...
#
# eg. grep for credit-dreams.com in the logs, for example. (they DON'T stick to just this
# domain, but it's one example). They're changing every possible way to detect this stuff...
# UA string, source IP, destination domain...
#
# So, referer spam may very well require something on the order of SpamAssassin to tackle properly.
sub is_backlink_spammer {
my $rec = shift;
return 1 if ($rec->{referer} =~ m{\b(?:
yourtrafficpackage\.com|
phentermine-1-pills\.com|
myspace\.com/escortgirlslondon|
3marketeersproductions\.com|
ablejobs\.com
)\b}xi);
}
# If it's probably directly from me
sub is_me {
my $rec = shift;
return 1 if ($rec->{ipaddr} eq '76.16.92.56');
return 1 if ($rec->{useragent} =~ /interiot/i);
}
BEGIN {
my %friend_static_ips = map {$_,1} (
"136.182.158.153", # motorola - Winsor usually
"136.182.158.137",
"136.182.158.145",
"136.182.158.129",
"129.188.69.161",
"129.188.69.169",
);
sub is_friends_static_ip {
my $rec = shift;
return 1 if ($friend_static_ips{$rec->{ipaddr}});
}
}
BEGIN {
my %individual_rss_reader = map {$_,1} (
"204.9.177.18", # LiveJournal, but reports it's for only one user
);
sub is_individual_rss_reader {
my $rec = shift;
return 1 if ($individual_rss_reader{$rec->{ipaddr}});
}
}
# either an RSS reader used by many people, or any other web-based tool that many people can access
sub is_public_automated_tool {
my $rec = shift;
return 1 if ($rec->{useragent} =~ m#
urltrends.com|
coralcdn.org|
aiderss.com
#ix);
}
BEGIN {
my %really_popular_pages = map {$_,1} (
"http://paperlined.org/apps/porn/.web2.0.html",
"http://paperlined.org/photoblog/2007_10_28/",
);
sub humans_really_like_this_page {
my $rec = shift;
return 1 if ($really_popular_pages{$rec->{url}});
}
}
# if it's content that's typically PART of a page on wikipedia, rather than being a URL that people
# navigate directly to (of course, we can do referer-checking to weed out most of these, but a few
# people have the referer turned off)
sub is_embedded_content {
my $rec = shift;
return 1 if ($rec->{url} =~ /\.(?:jpe?g|gif|png)$/si);
}
# a file that's frequently requested but rarely meaningful (should this be merged in with
# is_embedded_content()?)
sub is_stupid_file {
my $rec = shift;
return 1 if ($rec->{url} =~ m#/robots\.txt|favicon\.ico#si);
}
################################################### classifying referers ###########################################################
#sub is
1;
Generated by GNU enscript 1.6.4.