spiderhks-36-01.txt 2.41 KB
Newer Older
O'Reilly Media, Inc.'s avatar
O'Reilly Media, Inc. committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use Getopt::Long;

my $max = 10;
GetOptions(
    "max=i" => \$max,
);

my $search = shift or die "Must specify a search term";

my $w = WWW::Mechanize->new;
$w->get( "http://www.webshots.com/explore/" );
$w->success or die "Can't read the search page!\n";

$w->submit_form(
    form_number => 1,
    fields => { words => $search },
);
$w->success or die "Search failed!\n";

# execution of script stops if warning
# about adult content is returned.
if ( $w->content =~ /Adult content/i ) {
    die "Search term probably returns adult content\n";
}

my $ndownloads = 0;
NEXT_PAGE_LOOP: while(1) {
    $w->content =~ /Page (\d+) of (\d+)/
       or warn "Can't find page count\n";
    warn "On page $1 of $2...\n";

    # Pull the "Next" link off before we download pictures
    my $nextlink = $w->find_link( text => "Next >" );
    my $currpage = $w->uri;

    my @links = $w->find_all_links( url_regex =>
        qr[http://community.webshots.com/photo/] );

    for my $link ( @links ) {
       my $url = $link->url;
       my $text = $link->text;
       next if $text eq "[IMG]";

       $w->get( $url );
       $w->success or die "Couldn't fetch $url";

       if ($w->content=~m[(http://community\.webshots\.com/.+?\.(jpg|gif|png))]) {
           my $imgurl = $1; my $type = $2;

           # Make a name based on the webshots title for the pic
           my $filename = lc $text;        # Lowercase everything
           $filename =~ s/\s+/-/g;         # Spaces become dashes
           $filename =~ s/[^0-9a-z-]+//g;  # Strip all nonalphanumeric
           $filename =~ s/(^-|-$)//;       # Strip leading/trailing dashes
           $filename = "$filename.$type";

           # Bring down the image if we don't already have it
           if ( -e $filename ) { warn "Already have $filename\n"; }
           else {
               # use LWP's :content_file to save our
               # image directly to the filesystem,
               # instead of processing it ourselves.
               warn "Saving $filename...\n";
               $w->get( $imgurl, ":content_file"=>$filename );
               ++$ndownloads; last if $ndownloads >= $max;
           }
        } else { warn "Couldn't find an image on $url\n"; }
    }

    last unless $nextlink && ($ndownloads<$max);

    my $nexturl = URI->new_abs( $nextlink->url, $currpage )->as_string;
    $w->get( $nexturl ); die "$nexturl failed!\n" unless $w->success;
}