104 lines
2.0 KiB
Perl
Executable File
104 lines
2.0 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
use 5.010;
|
|
use open qw(:locale);
|
|
use strict;
|
|
use utf8;
|
|
use warnings qw(all);
|
|
|
|
use Mojo::UserAgent;
|
|
|
|
# FIFO queue
|
|
my @urls = map { Mojo::URL->new($_) } qw(
|
|
http://sysd.org/page/1/
|
|
http://sysd.org/page/2/
|
|
http://sysd.org/page/3/
|
|
);
|
|
|
|
# Limit parallel connections to 4
|
|
my $max_conn = 4;
|
|
|
|
# User agent following up to 5 redirects
|
|
my $ua = Mojo::UserAgent
|
|
->new(max_redirects => 5)
|
|
->detect_proxy;
|
|
|
|
# Keep track of active connections
|
|
my $active = 0;
|
|
|
|
Mojo::IOLoop->recurring(
|
|
0 => sub {
|
|
for ($active + 1 .. $max_conn) {
|
|
|
|
# Dequeue or halt if there are no active crawlers anymore
|
|
return ($active or Mojo::IOLoop->stop)
|
|
unless my $url = shift @urls;
|
|
|
|
# Fetch non-blocking just by adding
|
|
# a callback and marking as active
|
|
++$active;
|
|
$ua->get($url => \&get_callback);
|
|
}
|
|
}
|
|
);
|
|
|
|
# Start event loop if necessary
|
|
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
|
|
|
|
sub get_callback {
|
|
my (undef, $tx) = @_;
|
|
|
|
# Deactivate
|
|
--$active;
|
|
|
|
# Parse only OK HTML responses
|
|
return
|
|
if not $tx->res->is_status_class(200)
|
|
or $tx->res->headers->content_type !~ m{^text/html\b}ix;
|
|
|
|
# Request URL
|
|
my $url = $tx->req->url;
|
|
|
|
say $url;
|
|
parse_html($url, $tx);
|
|
|
|
return;
|
|
}
|
|
|
|
sub parse_html {
|
|
my ($url, $tx) = @_;
|
|
|
|
say $tx->res->dom->at('html title')->text;
|
|
|
|
# Extract and enqueue URLs
|
|
for my $e ($tx->res->dom('a[href]')->each) {
|
|
|
|
# Validate href attribute
|
|
my $link = Mojo::URL->new($e->{href});
|
|
next if 'Mojo::URL' ne ref $link;
|
|
|
|
# "normalize" link
|
|
$link = $link->to_abs($tx->req->url)->fragment(undef);
|
|
next unless grep { $link->protocol eq $_ } qw(http https);
|
|
|
|
# Don't go deeper than /a/b/c
|
|
next if @{$link->path->parts} > 3;
|
|
|
|
# Access every link only once
|
|
state $uniq = {};
|
|
++$uniq->{$url->to_string};
|
|
next if ++$uniq->{$link->to_string} > 1;
|
|
|
|
# Don't visit other hosts
|
|
next if $link->host ne $url->host;
|
|
|
|
push @urls, $link;
|
|
say " -> $link";
|
|
}
|
|
say '';
|
|
|
|
return;
|
|
}
|
|
__DATA__
|
|
Featured at:
|
|
http://blogs.perl.org/users/stas/2013/01/web-scraping-with-modern-perl-part-1.html
|