#!/usr/bin/env perl
# Asynchronously crawl a forum index with YADA (libcurl-based queue) and
# extract thread links from each fetched page via Web::Scraper XPath rules.
use 5.016;
use common::sense;
use utf8::all;

use Data::Dumper;

# Use fast binary libraries
use EV;
use Web::Scraper::LibXML;
use YADA 0.039;

use Scalar::Util qw( blessed );

YADA->new(
    common_opts => {
        # Available opts @ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html
        encoding       => '',
        followlocation => 1,
        maxredirs      => 5,
    },
    http_response => 1,
    max           => 4,  # up to 4 parallel transfers
)->append([qw[
    http://foo.qa/forum.root
]] => sub {
    my ($self) = @_;

    # Skip failed transfers and non-HTML payloads before parsing.
    return
        if $self->has_error
        or not $self->response->is_success
        or not $self->response->content_is_html;

    # Declare the scraper once and then reuse it across all callbacks.
    state $scraper = scraper {
        process q(//*[@id="thread-1090300"]/div[2]/div/h3/a[1]),
            q(link) => q(@href);
        # process q(//li[contains(@class, "discussionListItem")]), q(threads[]) => {
        #     threadId  => q(@id),
        #     q(data[]) => {
        #         author  => q(@data-author),
        #         q(shit) => scraper {
        #             process q(.main > div > h3 > a), q(links[]) => q(@href);
        #             process q(.main > div > div > div > span >a > abbr), q(date) => '@data-datestring'; # Any threads within 1 week give relative day of the week, actual is in CSS
        #             process q(.main > div > div > div > span >a > span), q(date) => 'text'; # Any older threads use actual date
        #             process q(.listBlock > .minor > dd ), q(views) => q(TEXT);
        #         }
        #     }
        # };
    };

    # Employ amazing Perl (en|de)coding powers to handle HTML charsets.
    # NB: scrape() returns a single hashref, not a list.
    my $doc = $scraper->scrape(
        $self->response->decoded_content,
        $self->final_url,
    );

    # print "$_\n" for @{$doc->{threads} // []};
    print Data::Dumper->Dump([$doc], [qw(doc)]);

    # Enqueue links from the parsed page
    # $self->queue->prepend([
    #     grep {
    #         blessed($_) and $_->can(q(host)) and $_->scheme =~ m{^https?$}x
    #             and $_->host eq $self->initial_url->host
    #             and (grep { length } $_->path_segments) <= 3
    #     } @{$doc->{links} // []}
    # ] => __SUB__);
})->wait;