http://paperlined.org/dev/src/pl/apache/parse_logs/ParseLog.pm

package ParseLog;

# TODO:
#   - debug the binary search code below

    use strict;
    use warnings;

    use lib '/home/interiot/cpan/lib';
    use Date::Parse;
    use CGI::Util;
    use Search::Binary;
    use List::Util qw[min];

################################################### parseCombinedLine ###########################################################

# thanks to http://www.fourmilab.ch/fourmilog/archives/2005-03/000500.html
my $pat_ip_address = qr/(\d{1,3} \.
    \d{1,3} \.
    \d{1,3} \.
    \d{1,3})/x;
my $pat_quoted_field = qr/"((?:(?:(?:(?:    # It can be...
    [^"\\])* |  # ...zero or more characters not quote or backslash...
    (?:\\x[0-9a-fA-F][0-9a-fA-F])* | # ...a backslash quoted hexadecimal character...
    (?:\\.*)                         # ...or a backslash escape.
   ))*))"/x;
my $parse_combined = qr/^       # Start at the beginning
     $pat_ip_address \s+        # IP address
     (\S+) \s+                  # Ident
     (\S+) \s+                  # Userid
     \[([^\]]*)\] \s+           # Date and time
     $pat_quoted_field \s+      # Request
     (\d+) \s+                  # Status
     (\-|[\d]+) \s+             # Length of reply or "-"
     $pat_quoted_field \s+      # Referer
     $pat_quoted_field          # User agent
     $                          # End at the end
   /x;

my @combined_fields = qw[
    ipaddr
    ident
    userid
    textdate
    request
    status
    replylen
    referer
    useragent
];

# reads and parses a combined log file line; returns undef on EOF
sub parseCombined {
    my $fh = shift;
    defined(my $str = <$fh>) or return;
    return parseCombinedLine($str, @_);
}

# parse a specific line that's been read in
sub parseCombinedLine {
    (my $str = shift) =~ s/\s+$//si;
    my $hostname = shift;
    my %ret;
    @ret{@combined_fields} = ($str =~ $parse_combined);
    foreach my $f (qw[ request referer useragent ])
        {$ret{$f} =~ s/\\(.)/$1/sg}
    @ret{qw[method url httpver]} = split(' ', delete $ret{request});
    $ret{date} = Date::Parse::str2time($ret{textdate});
    $ret{line} = $str;
    $ret{url} = "http://$hostname" . $ret{url};
    return \%ret;
}




################################################### locateDateFromEnd ###########################################################

# Given a specific time (ether a normal perl time, or the standard cookie expiration format, eg. "3m" is three minuets ago, "2y" is two years ago), 
# this finds the spot in the log file where the first record on or after that time is.
#
# Intended to be used to find lines relative to the end of the file...  if looking for lines near the beginning of a file, something based on Tie::File might be
# more useful, since it remembers the starting position of all lines up to the max it's read so far.
    sub locateEndBLOCKSIZE {64*1024}
sub locateDateFromEnd {
    my $fh = shift;
    my $time_sought = shift;
    my $lineParser = shift;         # any function that...   inputs: one string, comprising one log file line...     outputs: a hash-ref of fields, one of which should be "date"

    if ($time_sought !~ /^\d+$/) {
        $time_sought = CGI::Util::expire_calc("-" . $time_sought);
    }

    my $block_cache = {};
    my $pos = binary_search(0, (-s $fh)-1, $time_sought, \&locateEndRead,
        {block_cache => {},
         line_parser => $lineParser,
         fh => $fh,
        });

    seek($fh, $pos, 0);
}

    sub locateEndRead {
        my ($handle, $time_sought, $pos) = @_;

        my ($block_cache, $lineParser, $fh) = @$handle->{qw[block_cache line_parser fh]};

        # remove all blocks that aren't within 2*BLOCKSIZE of the current position
        foreach my $p (keys %$block_cache) {
            if (abs($p - $pos) > 2*locateEndBLOCKSIZE()) {
                delete $block_cache->{$p};
            }
        }

        # construct the line we needed
        my $blockpos_before = ($pos-1) % locateEndBLOCKSIZE();             # -1 because we want to see the character right before this string, to see if it's a newline or not
        my $blockpos_after  = $blockpos_before + locateEndBLOCKSIZE();
        $blockpos_before = 0 if ($blockpos_before < 0);                     # the very first string in the file doesn't have to / can't have a newline before it

        my $blockstr_before = ($block_cache->{$blockpos_before} ||= readblock($fh, $blockpos_before, locateEndBLOCKSIZE()));

        my $first_half = substr($blockstr_before, $pos-$blockpos_before);
        # If the position that Search::Binary gave us isn't the start of a whole record, then we need to move to the start of the next full record
        if ($pos > 0) {
            my $charbefore = substr($blockstr_before, $pos-$blockpos_before-1, 1);
            if (!($charbefore =~ /^\x0A$/s || ($charbefore =~ /^\x0D$/s && $first_half !~ /^\x0A/s))) {
                $blockstr_before =~ s/^(.*?(?:\x0D\x0A|\x0D|\x0A))//s;
                $pos += length($1);     # we need to tell Search::Binary where the whole record actually started
            }
        }
        # If the block we're reading doesn't contain an entire record, we need to look at the next block as well
        if ($first_half !~ s/[\n\r].*//si) {
            my $blockstr_after = ($block_cache->{$blockpos_after} ||= readblock($fh, $blockpos_after, locateEndBLOCKSIZE()));
            $first_half .= $blockstr_after || '';
            if ($first_half !~ s/[\n\r].*//si) {
                # TODO: unfortunately, if this is the last line in the file, it will fall into this branch...  add a check in case we're at the very end
                die "Unable to find end of string.";        # if lines are too long, we're not going to deal with that here...  just make the block size bigger than any line
            }
        }
        
        my $fields = $lineParser->($first_half);
        return ($fields->{date} <=> $time_sought, $pos);
    }

    sub readblock {
        my $fh = shift;
        my $offset = shift;
        my $len = shift;

        seek($fh, $offset, 0);
        my $in;
        read($fh, $in, $len);
        return $in;
    }


################################################### Classification ##############################################################



sub is_search_spider {
    my $rec = shift;

    return 1 if ($rec->{useragent} =~ m{
            \bcrawler\@dotnetdotcom\.org|
            \bYoudaoBot|
            \bYahoo!.Slurp|
            \bgoogle.com/bot.html|
            \bbaidu.com/search/spider.htm|
            \bbaidu.com/search/|
            \bask.com/en/docs/about|
            \bsearch.msn.com/msnbot.htm|
            \bsitesell.com/sbider.html|
            \bhelp.naver.com/delete_main.asp|
            \bVoilaBot|
            \bresearch.microsoft.com/research/sv/msrbot/|
            \bglobalspec.com/Ocelli|
            \bsearchme.com/support/|
            \bbecome.com/site_owners.html|
            \byodao.com/help/webmaster/spider/|
            \bexabot.com/go/robot|
            \bvoyager-hc|                     # http://www.kosmix.com/html/crawler.html
            \bdiscoveryengine.com/discobot.html|
            \bbing.com/bingbot.htm|
            \byandex.com/bots|
            \bscoutjet.com
        }ix);
}



# Wow, there are some serious referer spammers...
#
# eg. grep for credit-dreams.com in the logs, for example.   (they DON'T stick to just this
# domain, but it's one example).  They're changing every possible way to detect this stuff...
# UA string, source IP, destination domain...
#
# So, referer spam may very well require something on the order of SpamAssassin to tackle properly.
sub is_backlink_spammer {
    my $rec = shift;

    return 1 if ($rec->{referer} =~ m{\b(?:
                        yourtrafficpackage\.com|
                        phentermine-1-pills\.com|
                        myspace\.com/escortgirlslondon|
                        3marketeersproductions\.com|
                        ablejobs\.com
                        )\b}xi);
}


# If it's probably directly from me
sub is_me {
    my $rec = shift;

    return 1 if ($rec->{ipaddr} eq '76.16.92.56');
    return 1 if ($rec->{useragent} =~ /interiot/i);
}


BEGIN {
    my %friend_static_ips =  map {$_,1} (
        "136.182.158.153",      # motorola - Winsor usually
        "136.182.158.137",
        "136.182.158.145",
        "136.182.158.129",
        "129.188.69.161",
        "129.188.69.169",
    );
sub is_friends_static_ip {
    my $rec = shift;

    return 1 if ($friend_static_ips{$rec->{ipaddr}});
}
}



BEGIN {
    my %individual_rss_reader = map {$_,1} (
        "204.9.177.18",         # LiveJournal, but reports it's for only one user
    );
sub is_individual_rss_reader {
    my $rec = shift;
    return 1 if ($individual_rss_reader{$rec->{ipaddr}});
}
}



# either an RSS reader used by many people, or any other web-based tool that many people can access
sub is_public_automated_tool {
    my $rec = shift;

    return 1 if ($rec->{useragent} =~ m#
            urltrends.com|
            coralcdn.org|
            aiderss.com
        #ix);
}



BEGIN {
    my %really_popular_pages = map {$_,1} (
        "http://paperlined.org/apps/porn/.web2.0.html",
        "http://paperlined.org/photoblog/2007_10_28/",
    );
sub humans_really_like_this_page {
    my $rec = shift;
    return 1 if ($really_popular_pages{$rec->{url}});
}
}



# if it's content that's typically PART of a page on wikipedia, rather than being a URL that people
# navigate directly to  (of course, we can do referer-checking to weed out most of these, but a few
# people have the referer turned off)
sub is_embedded_content {
    my $rec = shift;

    return 1 if ($rec->{url} =~ /\.(?:jpe?g|gif|png)$/si);
}



# a file that's frequently requested but rarely meaningful  (should this be merged in with
# is_embedded_content()?)
sub is_stupid_file {
    my $rec = shift;

    return 1 if ($rec->{url} =~ m#/robots\.txt|favicon\.ico#si);
}



################################################### classifying referers ###########################################################

#sub is 

1;

Generated by GNU enscript 1.6.4.