66 lines
2.2 KiB
Perl
Executable File
66 lines
2.2 KiB
Perl
Executable File
#!/usr/bin/env perl
#
# Fetch the forum index page through YADA, run a Web::Scraper rule set over
# the returned HTML and dump whatever was extracted to STDOUT.
#
use 5.016;
use common::sense;
use utf8::all;
use Data::Dumper;

# Use fast binary libraries
use EV;
use Web::Scraper::LibXML;
use YADA 0.039;
use Scalar::Util qw( blessed );

YADA->new(
    common_opts => {
        # Available opts @ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html
        encoding       => '',
        followlocation => 1,
        maxredirs      => 5,
    },
    # NOTE(review): http_response is presumably what makes ->response
    # available inside the callback below -- confirm against the YADA docs.
    http_response => 1,
    max           => 4,
)->append([qw[
    http://foo.qa/forum.root
]] => sub {
    # Per-URL callback; the only argument used is the finished request object.
    my ($self) = @_;

    # Nothing to scrape unless the transfer went through and returned HTML.
    return if $self->has_error
        or not $self->response->is_success
        or not $self->response->content_is_html;

    # Declare the scraper once and then reuse it
    state $scraper = scraper {
        # NOTE: this XPath is pinned to one specific thread id.
        process q(//*[@id="thread-1090300"]/div[2]/div/h3/a[1]), q(link) => q(@href);

        # Work in progress: generic extraction of every thread on the page.
        # process q(//li[contains(@class, "discussionListItem")]), q(threads[]) => {
        #     threadId => q(@id),
        #     q(data[]) => {
        #         author => q(@data-author),
        #         q(shit) => scraper{
        #             process q(.main > div > h3 > a), q(links[]) => q(@href);
        #             process q(.main > div > div > div > span >a > abbr), q(date) => '@data-datestring'; #Any threads within 1 week give relative day of the week, actual is in CSS
        #             process q(.main > div > div > div > span >a > span), q(date) => 'text'; #Any older threads use actual date
        #             process q(.listBlock > .minor > dd ), q(views) => q(TEXT);
        #         }
        #     }
        # };
    };

    # Employ amazing Perl (en|de)coding powers to handle HTML charsets.
    # The result is kept in a scalar so that it matches the $doc->{...}
    # accesses in the commented-out code below.
    my $doc = $scraper->scrape(
        $self->response->decoded_content,
        $self->final_url,
    );

    #print "$_\n" for @{$doc->{threads} // []};
    print Dumper($doc);

    # Enqueue links from the parsed page
    # $self->queue->prepend([
    #     grep {
    #         $_->can(q(host)) and $_->scheme =~ m{^https?$}x
    #         and $_->host eq $self->initial_url->host
    #         and (grep { length } $_->path_segments) <= 3
    #     } @{$doc->{links} // []}
    # ] => __SUB__);
})->wait;