# http://paperlined.org/dev/src/pl/ahk/fetch_post_chunks.pl
#!/usr/bin/perl
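# ======== oops ====
# (Apparently the tail of this script's output from a run that ended when the
# server started throttling requests; kept here as a record.)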
# Wrote 0 chunks from page 434. Unable to parse 'next' pointer from:
#
#
# Your IP address is making an unusually high number of simultaneous connections to the server.
# After a short time, try a refresh/reload on this page.
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request::Common;
use Data::Dumper;
# Fetch every post by forum user "Chris" from the AutoHotkey forum search and
# hand each results page to parse_comments(), following the "Next" link until
# none can be found.
#
# Usage: fetch_post_chunks.pl [start_on_page]
#
# When start_on_page is given, the first results page is still fetched and
# parsed (the search has to be submitted to obtain a search_id); the first
# "Next" link is then rewritten to jump ahead.
my ($start_on_page) = @ARGV;
if (defined $start_on_page && $start_on_page !~ /\A\d+\z/) {
    die "Usage: $0 [start_on_page]\n";
}

# NOTE(review): the offset assumes 15 results per page -- confirm. If the
# forum's first page is start=0, page N would begin at (N - 1) * 15 rather
# than N * 15; verify against the site before relying on the page labels.
my $start_on_post;
if ($start_on_page) {
    $start_on_post = $start_on_page * 15;
}

my $ua = LWP::UserAgent->new();

# Submit the search form; the response is the first page of results.
my $res = $ua->request(POST 'http://www.autohotkey.com/forum/search.php?mode=results',
    [search_keywords => "",
     search_terms    => "all",
     search_author   => "Chris",
     search_forum    => "-1",
     search_time     => "0",
     search_fields   => "all",
     show_results    => "posts",
     return_chars    => "1000",
     sort_by         => "0",
     sort_dir        => "DESC"]);

my $page = 0;
while (1) {
    parse_comments($res->content, ++$page);

    my ($next) = ($res->content =~ m#<a href="(search\.php\?search_id=[^"'>]*)">Next</a>#is);
    last unless $next;

    # The href comes straight out of HTML, so its parameter separators are
    # written as the entity "&amp;" and must be decoded before requesting it.
    $next =~ s/&amp;/&/g;
    $next = "http://www.autohotkey.com/forum/$next";

    if ($start_on_post) {
        # Jump ahead once.  The lookahead lets this work whether or not
        # "start" is the last parameter of the query string.
        if ($next =~ s/&start=\d+(?=&|\z)/&start=$start_on_post/) {
            # parse_comments() is called with ++$page on the next iteration
            $page = ($start_on_page - 1);
        }
        else {
            warn "No 'start' parameter found in $next; not skipping ahead.\n";
        }
        $start_on_post = undef;
    }

    print "$next\n";
    $res = $ua->request(GET $next);

    # Pause between requests.
    sleep(1.0);
}

# Reached whenever no "Next" link was found -- on the genuine last page as
# well as on an error page -- so show what the server actually sent.
print "Unable to parse 'next' pointer from:\n\n\n", $res->content;
# Split one search-results page into individual posts and save each post to
# the file chunks/<post id> (relative to the current directory; the chunks
# directory must already exist).
#
# Arguments:
#   $html - raw HTML of one search-results page
#   $page - page number, used only in the progress message
#
# Prints "Wrote N chunks from page P." to the currently selected output
# handle.  If a post's ID cannot be found, the offending chunk is printed and
# the whole program exits.  Dies if a chunk file cannot be written.
sub parse_comments {
    my ($html, $page) = @_;

    # split into distinct posts
    my @post_chunks = split /<td class="catHead" [^>]*>/, $html;

    # An empty page yields no pieces at all, and modifying $post_chunks[-1]
    # of an empty array is a fatal error, so only trim when there is
    # something to trim.
    if (@post_chunks) {
        # cut the page footer off the final piece
        $post_chunks[-1] =~ s/<td class="catBottom".*//si;
        # the first piece is whatever precedes the first post header
        shift @post_chunks;
    }

    foreach my $chunk (@post_chunks) {
        #my ($id) = ($chunk =~ /highlight=#(\d+)/i);
        my ($id) = ($chunk =~ /Subject: <b><a href="[^"'> ]*#(\d+)/si);
        if (!$id) {
            print "Unable to identify ID in:\n\n\n$chunk\n";
            exit;
        }

        my $path = "chunks/$id";
        open my $fout, '>', $path or die "Cannot open $path for writing: $!";
        print {$fout} $chunk      or die "Cannot write to $path: $!";
        # buffered write errors only surface at close
        close $fout               or die "Cannot close $path: $!";
    }

    printf "Wrote %d chunks from page %d.\t", scalar(@post_chunks), $page;
    return;
}
# Generated by GNU enscript 1.6.4.