# Example files for the title:
## Spidering Hacks, by Tara Calishain
[![Spidering Hacks, by Tara Calishain](http://akamaicovers.oreilly.com/images/9780596005771/cat.gif)](https://www.safaribooksonline.com/)
The following applies to example files from material published by O’Reilly Media, Inc. Content from other publishers may include different rules of usage. Please refer to any additional usage rights explained in the actual example files or refer to the publisher’s website.
O'Reilly books are here to help you get your job done. In general, you may use the code in O'Reilly books in your programs and documentation. You do not need to contact us for permission unless you're reproducing a significant portion of the code. For example, writing a program that uses several chunks of code from our books does not require permission. Answering a question by citing our books and quoting example code does not require permission. On the other hand, selling or distributing a CD-ROM of examples from O'Reilly books does require permission. Incorporating a significant amount of example code from our books into your product's documentation does require permission.
We appreciate, but do not require, attribution. An attribution usually includes the title, author, publisher, and ISBN.
If you think your use of code examples falls outside fair use or the permission given here, feel free to contact us at <permissions@oreilly.com>.
Please note that the examples are not production code and have not been carefully tested. They are provided "as-is" and come with no warranty of any kind.
#!/usr/bin/perl -w
#
# Progress Bar: Dots - Simple example of an LWP progress bar.
# http://disobey.com/d/code/ or contact morbus@disobey.com.
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
use strict; $|++;
my $VERSION = "1.0";
# make sure we have the modules we need, else die peacefully.
eval("use LWP 5.6.9;"); die "[err] LWP 5.6.9 or greater required.\n" if $@;
# now, check for passed URLs for downloading.
die "[err] No URLs were passed for processing.\n" unless @ARGV;
# our downloaded data.
my $final_data = undef;
# loop through each URL.
foreach my $url (@ARGV) {
    print "Downloading URL at ", substr($url, 0, 40), "... ";

    # create a new useragent and download the actual URL.
    # all the data gets thrown into $final_data, which
    # the callback subroutine appends to.
    my $ua = LWP::UserAgent->new( );
    my $response = $ua->get($url, ':content_cb' => \&callback );
    print "\n"; # after the final dot from downloading.
}
# per chunk.
sub callback {
    my ($data, $response, $protocol) = @_;
    $final_data .= $data;
    print ".";
}
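The dots script prints one dot per chunk received, so the pace of the dots depends on however large LWP's read chunks happen to be. A minimal sketch, assuming steadier dots are wanted: get( ) also accepts a :read_size_hint option alongside :content_cb, which suggests (but does not guarantee) a chunk size.

#!/usr/bin/perl -w
# Sketch: suggest a chunk size so the dots tick at a steadier
# rate. :read_size_hint is only a hint; the actual chunk sizes
# are still up to LWP and the network.
use strict;
use LWP 5.6.9;

my $url = shift or die "usage: $0 <url>\n";
my $ua  = LWP::UserAgent->new( );
$ua->get( $url,
    ':content_cb'     => sub { print "." },
    ':read_size_hint' => 8192, # ask for roughly 8K chunks.
);
print "\n";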
#!/usr/bin/perl -w
#
# Progress Bar: Wget - Wget style progress bar with LWP.
# http://disobey.com/d/code/ or contact morbus@disobey.com.
# Original routine by tachyon at http://tachyon.perlmonk.org/
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
use strict; $|++;
my $VERSION = "1.0";
# make sure we have the modules we need, else die peacefully.
eval("use LWP 5.6.9;"); die "[err] LWP 5.6.9 or greater required.\n" if $@;
# now, check for passed URLs for downloading.
die "[err] No URLs were passed for processing.\n" unless @ARGV;
# happy-go-lucky variables.
my $final_data; # our downloaded data.
my $total_size; # total size of the URL.
# loop through each URL.
foreach my $url (@ARGV) {
    print "Downloading URL at ", substr($url, 0, 40), "...\n";

    # create a new useragent and download the actual URL.
    # all the data gets thrown into $final_data, which
    # the callback subroutine appends to. before that,
    # though, get the total size of the URL in question.
    my $ua = LWP::UserAgent->new( );
    my $result = $ua->head($url);
    my $remote_headers = $result->headers;
    $total_size = $remote_headers->content_length;

    # now do the downloading.
    my $response = $ua->get($url, ':content_cb' => \&callback );
}
# per chunk.
sub callback {
    my ($data, $response, $protocol) = @_;
    $final_data .= $data;
    print progress_bar( length($final_data), $total_size, 25, '=' );
}
# wget-style. routine by tachyon
# at http://tachyon.perlmonk.org/
sub progress_bar {
    my ( $got, $total, $width, $char ) = @_;
    $width ||= 25; $char ||= '=';
    my $num_width = length $total;
    sprintf "|%-${width}s| Got %${num_width}s bytes of %s (%.2f%%)\r",
        $char x (($width-1)*$got/$total) . '>',
        $got, $total, 100*$got/$total;
}
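The wget-style bar divides by the Content-Length learned from the HEAD request, so a server that omits that header (or refuses HEAD entirely) leaves $total_size undefined and progress_bar divides by zero. A minimal guard sketch, assuming a fallback to plain dots is acceptable when the size is unknown:

# Sketch: inside the foreach loop, only draw the bar when the
# server actually told us how big the download will be.
$total_size = $remote_headers->content_length;
if ($total_size) {
    $ua->get($url, ':content_cb' => \&callback);
} else {
    # no Content-Length; fall back to a dot per chunk.
    $ua->get($url, ':content_cb' => sub { print "." });
}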
#!/usr/bin/perl -w
#
# Progress Bar: Term::ProgressBar - progress bar with LWP.
# http://disobey.com/d/code/ or contact morbus@disobey.com.
# Original routine by tachyon at http://tachyon.perlmonk.org/
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
use strict; $|++;
my $VERSION = "1.0";
# make sure we have the modules we need, else die peacefully.
eval("use LWP 5.6.9;");
die "[err] LWP is not the required version.\n" if $@;
eval("use Term::ProgressBar;"); # prevent word-wrapping.
die "[err] Term::ProgressBar not installed.\n" if $@;
# now, check for passed URLs for downloading.
die "[err] No URLs were passed for processing.\n" unless @ARGV;
# happy-go-lucky variables.
my $final_data = ''; # our downloaded data (start empty so length( ) is accurate).
my $total_size; # total size of the URL.
my $progress; # progress bar object.
my $next_update = 0; # reduce ProgressBar use.
# loop through each URL.
foreach my $url (@ARGV) {
    print "Downloading URL at ", substr($url, 0, 40), "...\n";

    # create a new useragent and download the actual URL.
    # all the data gets thrown into $final_data, which
    # the callback subroutine appends to. before that,
    # though, get the total size of the URL in question.
    my $ua = LWP::UserAgent->new( );
    my $result = $ua->head($url);
    my $remote_headers = $result->headers;
    $total_size = $remote_headers->content_length;

    # initialize our progress bar.
    $progress = Term::ProgressBar->new({count => $total_size, ETA => 'linear'});
    $progress->minor(0); # turns off the floating asterisks.
    $progress->max_update_rate(1); # only relevant when ETA is used.

    # now do the downloading.
    my $response = $ua->get($url, ':content_cb' => \&callback );

    # top off the progress bar.
    $progress->update($total_size);
}
# per chunk.
sub callback {
    my ($data, $response, $protocol) = @_;
    $final_data .= $data;

    # reduce usage, as per example 3 in POD.
    $next_update = $progress->update(length($final_data))
        if length($final_data) >= $next_update;
}
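Term::ProgressBar's update( ) returns the next value at which the bar will visibly change, which is what the $next_update bookkeeping above exploits; the same pattern works outside LWP. A minimal standalone sketch:

#!/usr/bin/perl -w
# Sketch: the update( )/next_update pattern on its own, counting
# to 1000 without redrawing the terminal on every iteration.
use strict;
use Term::ProgressBar;

my $progress    = Term::ProgressBar->new({ name => 'work', count => 1000 });
my $next_update = 0;

for my $done (1 .. 1000) {
    # only touch the terminal when the bar would visibly change.
    $next_update = $progress->update($done) if $done >= $next_update;
}
$progress->update(1000); # top off the bar.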
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TokeParser;
use URI::Escape;
# The artist to search for should be given as an argument.
my $artist = $ARGV[0];
die "No artist specified\n" unless defined $artist and $artist ne '';
# We use URI::Escape to convert the artist's name
# into a form that can be encoded as part of a URL.
my $search = uri_escape($artist);
# 1. Download the page to be worked on.
#####################################
my $content = get( 'http://www.echocloud.net/index.php?searchword=' .
                   $search . '&option=asearch&stype=2&order=0&nrows=6' );
# Now that we have our content, initialize a
# new HTML::TokeParser object with it.
my $stream = HTML::TokeParser->new(\$content);
print "Artists liked by $artist listeners include:\n";
# 2. Determine the structure of the HTML document.
# An HTML result looks like: <a href='index.php?searchword
# =Beck&option=asearch' class="cf">&nbsp;Beck</a>
#####################################
# 3. Model the structure in code.
# Given that each <a class="cf"> contains our result, we:
# - Search for each <a> tag.
# - If it has a 'class' attribute, and
# the class attribute is "cf":
# - Save all the text from <a> to </a>.
# - Repeat.
#
# Of the methods used below, the two from TokeParser are:
# get_tag: Move the stream to the next occurrence of a tag.
# get_trimmed_text: Store text from the current location
# of the stream to the tag given.
#####################################
# For each <a> tag
while (my $tag = $stream->get_tag("a")) {

    # Is there a 'class' attribute? Is it 'cf'?
    if ($tag->[1]{class} and $tag->[1]{class} eq "cf") {

        # Store everything from <a> to </a>.
        my $result = $stream->get_trimmed_text("/a");

        # Remove the leading '&nbsp;' character.
        $result =~ s/^.//g;

        # Echocloud sometimes returns the artist we searched
        # for as one of the results. Skip the current loop
        # if the string given matches one of the results.
        next if $result =~ /$artist/i;

        # And we can print our final result.
        print " - $result\n";
    }
}
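The get_tag/get_trimmed_text pattern is easy to exercise without touching the network, which helps when a site (or Echocloud itself) changes or disappears. A minimal sketch against a canned string; the markup here is invented for illustration:

#!/usr/bin/perl -w
# Sketch: run HTML::TokeParser over an inline string instead of
# a downloaded page. The HTML below is made up.
use strict;
use HTML::TokeParser;

my $html = '<p><a class="cf" href="/a">Beck</a>'
         . '<a class="xx" href="/b">skip me</a></p>';
my $stream = HTML::TokeParser->new(\$html);

while (my $tag = $stream->get_tag("a")) {
    next unless $tag->[1]{class} and $tag->[1]{class} eq "cf";
    print " - ", $stream->get_trimmed_text("/a"), "\n";
}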
#!/usr/bin/perl -w
use strict;
$|++;
use File::Basename;
use WWW::Mechanize 0.48;
my $mech = WWW::Mechanize->new( );
# Get the starting search page
$mech->get( "http://search.cpan.org" );
$mech->success or die $mech->response->status_line;
# Select the form, fill the fields, and submit
$mech->form_number( 1 );
$mech->field( query => "Lester" );
$mech->field( mode => "author" );
$mech->submit( );
$mech->success or die "post failed: ",
$mech->response->status_line;
# Find the link for "Andy"
$mech->follow_link( text_regex => qr/Andy/ );
$mech->success or die "post failed: ", $mech->response->status_line;
# Get all the tarballs
my @links = $mech->find_all_links( url_regex => qr/\.tar\.gz$/ );
my @urls = map { $_->[0] } @links;
print "Found ", scalar @urls, " tarballs to download\n";
for my $url ( @urls ) {
    my $filename = basename( $url );
    print "$filename --> ";
    $mech->get( $url, ':content_file' => $filename );
    print -s $filename, " bytes\n";
}
#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use HTML::TokeParser;
# put the email address you registered
# with the Radio Times site here.
my $email = 'your email address';
die "Must provide an email address\n" if $email eq 'your email address';
# We create a WWW::Mechanize object and tell it the address of the site
# we'll be working from. The Radio Times' front page has an image link
# with an ALT text of "My Diary", so we can use that to get to the right
# section of the site:
my $agent = WWW::Mechanize->new( );
$agent->get("http://www.radiotimes.beeb.com/");
$agent->follow("My Diary");
# The returned page contains two forms - one to allow you to choose from a
# list box of program types, and then a login form for the diary
# function. We tell WWW::Mechanize to use the second form for input.
# (Something to remember here is that WWW::Mechanize's list of forms,
# unlike an array in Perl, is indexed starting at 1 rather than 0.
# Therefore, our index is '2'.)
$agent->form(2);
# Now we can fill in our email address for the '<INPUT name="email"
# type="text">' field and click the submit button. Nothing too
# complicated here.
$agent->field("email", $email);
$agent->click( );
# WWW::Mechanize moves us on to our Diary page. This is the page
# we need to process to find the date details. On looking at the
# HTML source for this page, we can see the HTML we need to work
# through is something like:
#
# <input>
# <tr><td></td></tr>
# <tr><td></td><td></td><td class="bluetext">Date of episode</td></tr>
# <td></td><td></td>
# <td class="bluetext"><b>Time of episode</b></td></tr>
# <a href="page_with_episode_info"></a>
#
# This can be modelled with HTML::TokeParser as below. The important
# methods to note are get_tag, which will move the stream on to the
# next start of the tag given, and get_trimmed_text, which will take
# the text between the current tag and a given tag. For example, for the
# HTML code "<b>Bold text here</b>", my $tag = get_trimmed_text("/b")
# would return "Bold text here" to $tag.
# Also note that we're initializing HTML::TokeParser on
# '\$agent->{content}' - this is an internal variable for WWW::Mechanize,
# exposing the HTML content of the current page.
my $stream = HTML::TokeParser->new(\$agent->{content});
my $date; # will hold the current show's datestamp.
# <input>
$stream->get_tag("input");
# <tr><td></td></tr><tr>
$stream->get_tag("tr"); $stream->get_tag("tr");
# <td></td><td></td>
$stream->get_tag("td"); $stream->get_tag("td");
# <td class="bluetext">Date of episode</td></tr>
my $tag = $stream->get_tag("td");
if ($tag->[1]{class} and $tag->[1]{class} eq "bluetext") {
    $date = $stream->get_trimmed_text("/td");
    # The date contains '&nbsp;', which we'll translate to a space.
    $date =~ s/\xa0/ /g;
}
# <td></td><td></td>
$stream->get_tag("td");
# <td class="bluetext"><b>Time of episode</b>
$tag = $stream->get_tag("td");
if ($tag->[1]{class} and $tag->[1]{class} eq "bluetext") {
    $stream->get_tag("b");
    # This concatenates the time of the showing to the date.
    $date .= ", from " . $stream->get_trimmed_text("/b");
}
# </td></tr><a href="page_with_episode_info"></a>
$tag = $stream->get_tag("a");
# Match the URL to find the page giving episode information.
$tag->[1]{href} =~ m!src=(http://.*?)'!;
my $show = $stream->get_trimmed_text("a");
# We have a scalar, $date, containing a string that looks something like
# "Thursday 23 January, from 6:45pm to 7:30pm.", and we have a URL, in
# $1, that will tell us more about that episode. We tell WWW::Mechanize
# to go to the URL:
$agent->get($1);
# The navigation we want to perform on this page is far less complex than
# on the last page, so we can avoid using a TokeParser for it - a regular
# expression should suffice. The HTML we want to parse looks something
# like this:
#
# <br><b>Episode</b><br> The Episode Title<br>
#
# We use a regex delimited with '!' in order to avoid having to escape the
# slashes present in the HTML, and store any number of alphanumeric
# characters after some whitespace, all in between <br> tags after the
# Episode header:
$agent->{content} =~ m!<br><b>Episode</b><br>\s+?(\w+?)<br>!;
# $1 now contains our episode, and all that's
# left to do is print out what we've found:
my $episode = $1;
print "The next Buffy episode ($episode) is on $date.\n";
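One modernization note: current releases of WWW::Mechanize expose the page body through a content( ) method, which is friendlier than reading the internal {content} slot directly. A minimal sketch of the same TokeParser initialization, assuming a reasonably recent WWW::Mechanize:

# Sketch: build the stream from the public accessor instead
# of reaching into the object's internals.
my $html   = $agent->content;
my $stream = HTML::TokeParser->new(\$html);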
#!/usr/bin/perl -w
use strict;

# Fragment: this match expects $_ to already hold the HTML of
# an HP printer's supply-status page; see the driver sketch
# after the regex.
my (@s) = m{
    >                  # close of previous tag
    ([^<]+)            # text (name of part, e.g., q/BLACK CARTRIDGE/)
    <br>
    ([^<]+)            # part number (e.g., q/HP Part Number: HP C9724A/+)
    (?:<[^>]+>\s*){4}  # separated by four tags
    (\d+)              # percent remaining
  |                    # --or--
    (?:
      # different text values
      (?:
          Pages\sRemaining
        | Low\sReached
        | Serial\sNumber
        | Pages\sprinted\swith\sthis\ssupply
      ) : (?:\s*<[^>]+>){6}\s*  # colon, separated by six tags
      # or just this, within the current element
      | Based\son\shistorical\s\S+\spage\scoverage\sof\s
    )
    (\w+)              # and the value we want
}gx;
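On its own the fragment never runs: something has to put the printer's status page into $_ first, since a bare m{ }gx matches against $_. A minimal driver sketch; the URL below is a placeholder, not a real printer:

#!/usr/bin/perl -w
# Sketch: fetch the status page into $_ so the fragment's match
# has something to work on. The address below is invented.
use strict;
use LWP::Simple;

local $_ = get('http://printer.example.com/hp/device/info_suppliesStatus.htm')
    or die "Couldn't fetch the status page\n";

# ...the my (@s) = m{ ... }gx match from above goes here;
# @s then holds the captured name/part/percent/value runs.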
#!/usr/bin/perl -w
use strict;
use utf8;
use LWP::Simple;
use XML::LibXML;
use URI;
# Set up the parser, and set it to recover
# from errors so that it can handle broken HTML
my $parser = XML::LibXML->new( ); $parser->recover(1);
# Parse the page into a DOM tree structure
my $url = 'http://junglescan.com/';
my $data = get($url) or die "Couldn't fetch $url\n";
my $doc = $parser->parse_html_string($data);
# Extract the table rows (as an
# array of references to DOM nodes)
my @winners = $doc->findnodes(q{
/html/body/table/tr/td[1]/font/form[2]/table[2]/tr
});
# The first two rows contain headings,
# and we want only the top five, so slice.
@winners = @winners[2..6];
foreach my $product (@winners) {

    # Get the percentage change and type.
    # We use the find method since we only need strings.
    my $change = $product->find('td[4]');
    my $type   = $product->find('td[3]//img/@alt');

    # Get the title. It has some annoying
    # whitespace, so we trim that off with regexes.
    my $title = $product->find('td[3]//tr[1]');
    $title =~ s/^\s*//; $title =~ s/\xa0$//;

    # Get the first link ("Visit Amazon.com page").
    # This is relative to the page's URL, so we make it absolute.
    my $relurl = $product->find('td[3]//a[1]/@href');
    my $absurl = URI->new($relurl)->abs($url);

    # Output. There isn't always a type, so we ignore it if there isn't.
    print "$change $title";
    print " [$type]" if $type;
    print "\n Amazon info: $absurl\n\n";
}
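The split between findnodes and find is the part worth remembering: findnodes returns a list of DOM nodes you can keep navigating, while find returns a value that stringifies nicely. A minimal sketch against canned HTML; the markup is invented:

#!/usr/bin/perl -w
# Sketch: the same findnodes/find calls, no network needed.
# The HTML below is made up for the example.
use strict;
use XML::LibXML;

my $parser = XML::LibXML->new( ); $parser->recover(1);
my $doc = $parser->parse_html_string(
    '<html><body><table>
     <tr><td>+12%</td><td>Some Book</td></tr>
     <tr><td>-3%</td><td>Another Book</td></tr>
     </table></body></html>'
);

foreach my $row ($doc->findnodes('//table/tr')) {
    my $change = $row->find('td[1]'); # stringifies on use.
    my $title  = $row->find('td[2]');
    print "$change $title\n";
}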
#!/usr/bin/perl -w
#
# translate.pl - translates the output of wsp.pl -v.
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
use strict;
my $save_url;
my $count = 1;
# Print the basics
print "#!/usr/bin/perl\n";
print "use warnings;\n";
print "use strict;\n";
print "use LWP::UserAgent;\n";
print "my \$ua = LWP::UserAgent->new;\n\n";
# read through wsp's output.
while (<>) {
    chomp; s/\x0D$//;

    # add our HTTP request headers...
    if (/^INPUT: ([a-zA-Z0-9\-\_]+): (.*)$/) {
        print '$req'.$count.'->header(\''.$1."' => '".$2."');\n";
    }

    # what URL we're actually requesting...
    if (/^Request for URL: (.*)$/) { $save_url = $1; }

    # the HTTP 1.x request line (GET or POST).
    if (/^FIRST LINE: ([A-Z]+) \S+ (.*)$/) {
        print "\n\n### request number $count ###\n";
        print "my \$req$count = HTTP::Request->new($1 => '$save_url');\n";
    }

    # the POST information sent off, if any.
    if (/^POST body: (.*)$/) { print "\$req$count->content('$1');\n"; }

    # and finish up our request.
    if (/^ --- Done sending./) {
        print "print \$ua->request(\$req$count)->as_string;\n";
        $count++; # move on to our next request. yeedawg.
    }
}
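For reference, translate.pl keys off five line shapes in wsp.pl's verbose output. An illustrative fragment of input it understands (the values here are made up):

Request for URL: http://www.example.com/
FIRST LINE: GET /index.html HTTP/1.1
INPUT: Host: www.example.com
INPUT: User-Agent: Mozilla/4.76 [en] (Win98; U)
 --- Done sending.

Run it as 'perl translate.pl wsp-output.txt > replay.pl' to turn a captured session into a standalone LWP script.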
#!/usr/bin/perl -w
#
# LeechGrounds - saves flash files from Newgrounds.com.
# Part of the Leecharoo suite - for all those hard to leech places.
# http://disobey.com/d/code/ or contact morbus@disobey.com.
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
use strict; $|++;
my $VERSION = "1.0";
use File::Spec::Functions;
# make sure we have the modules we need, else die peacefully.
eval("use LWP 5.6.9;"); die "[err] LWP 5.6.9 or greater required.\n" if $@;
# our download URLs are found in this URL (which'll
# be tweaked with the date and ID we care about).
my $base_url = "http://newgrounds.com/portal/content.php";
my $dir = "newgrounds"; # save downloads to...?
mkdir $dir; # make sure that dir exists.
my $date; # date from newgrounds server.
# create a final hash that contains
# all the IDs we'll be downloading.
my %ids;
foreach (@ARGV) {
    next unless /\d/; # numbers only.

    # if it's a range, work through it.
    if (/(\d+)-(\d+)/) {
        my ($start, $end) = ($1, $2);
        for (my $i = $start; $i <= $end; $i++) {
            $ids{$i} = undef; # alive, alive!
        }
    } else { $ids{$_} = undef; } # normal number.
}
# create a downloader, faking the User-Agent to get past filters.
my $ua = LWP::UserAgent->new(agent => 'Mozilla/4.76 [en] (Win98; U)');
# now that we have a list of IDs we want to
# download, get the date value from first page.
# we'll use this to get the final download URLs.
print "-" x 76, "\n"; # pretty visual seperator.
foreach my $id (sort {$a <=> $b} keys %ids) {

    # get the date first time through. test the match directly,
    # since $1 would otherwise hold a stale capture on failure.
    unless ($date) {
        print "Trying to grab a date string from $id... ";
        my $response = $ua->get("http://newgrounds.com/portal/view.php?id=$id");
        if ($response->content =~ /&date=(\d+)&quality=b/) {
            print "yes!\n"; $date = $1; # store the date for later use.
        } else { print "bah!\n"; next; }
    }

    # now, we can get the download URL to our Flash file.
    # note that we get ALL the download URLs before we
    # actually download. this saves us from having to
    # error check when we're out-of-date on long downloads.
    print "Determining download URL for $id... ";
    my $response = $ua->get("$base_url?id=$id&date=$date");
    my $data = $response->content; # our content.
    unless ($data =~ m!uploads\.newgrounds\.com/(.*swf)!) {
        print "no URL found!\n"; next;
    }
    $ids{$id} = "http://uploads.newgrounds.com/$1";
    print "done!\n";
}
print "-" x 76, "\n"; # pretty!
# if we're here, we have our URLs to download in
# our hash, so we just run through the basics now.
foreach my $id (sort {$a <=> $b} keys %ids) {

    # only work on IDs with URLs.
    next unless defined ($ids{$id});

    # get URL/filename.
    my $url = $ids{$id}; $url =~ /([^\/]*\.swf)/;
    my $filename = $1; print "Downloading $filename... ";

    # and use :content_file to autosave to our directory.
    $ua->get($url, ':content_file' => "$dir/$filename");
    print "done!\n"; # easier said than don... oh, nevermind.
}
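LeechGrounds takes bare IDs and dash-separated ranges on the command line; both forms end up as keys in %ids before any downloading starts. An illustrative invocation (the IDs are made up):

perl leechgrounds.pl 4372 5000-5004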
#!/bin/sh
#
# LeechiFilm - saves movies from iFilm.com.
# Part of the Leecharoo suite - for all those hard to leech places.
# http://disobey.com/d/code/ or contact morbus@disobey.com.
#
# This code is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
for id in $*; do
f56="http://anon.ifilm.speedera.net/anon.ifilm/qt/portal/${id}_56.mov"
f200="http://anon.ifilm.speedera.net/anon.ifilm/qt/portal/${id}_200.mov"
f500="http://anon.ifilm.speedera.net/anon.ifilm/qt/portal/${id}_500.mov"
wget -c $f500 || wget -c $f200 || wget -c $f56
done
#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use URI; # for URI->new_abs below.
use Getopt::Long;
my $max = 10;
GetOptions(
    "max=i" => \$max,
);
my $search = shift or die "Must specify a search term";
my $w = WWW::Mechanize->new;
$w->get( "http://www.webshots.com/explore/" );
$w->success or die "Can't read the search page!\n";
$w->submit_form(
form_number => 1,
fields => { words => $search },
);
$w->success or die "Search failed!\n";
# execution of script stops if warning
# about adult content is returned.
if ( $w->content =~ /Adult content/i ) {
    die "Search term probably returns adult content\n";
}
my $ndownloads = 0;
NEXT_PAGE_LOOP: while (1) {
    $w->content =~ /Page (\d+) of (\d+)/
        or warn "Can't find page count\n";
    warn "On page $1 of $2...\n";

    # Pull the "Next" link off before we download pictures
    my $nextlink = $w->find_link( text => "Next >" );
    my $currpage = $w->uri;

    my @links = $w->find_all_links( url_regex =>
        qr[http://community.webshots.com/photo/] );

    for my $link ( @links ) {
        my $url  = $link->url;
        my $text = $link->text;
        next if $text eq "[IMG]";

        $w->get( $url );
        $w->success or die "Couldn't fetch $url";

        if ($w->content =~ m[(http://community\.webshots\.com/.+?\.(jpg|gif|png))]) {
            my $imgurl = $1; my $type = $2;

            # Make a name based on the webshots title for the pic
            my $filename = lc $text;       # Lowercase everything
            $filename =~ s/\s+/-/g;        # Spaces become dashes
            $filename =~ s/[^0-9a-z-]+//g; # Strip all nonalphanumeric
            $filename =~ s/(^-|-$)//;      # Strip leading/trailing dashes
            $filename = "$filename.$type";

            # Bring down the image if we don't already have it
            if ( -e $filename ) { warn "Already have $filename\n"; }
            else {
                # use LWP's :content_file to save our
                # image directly to the filesystem,
                # instead of processing it ourselves.
                warn "Saving $filename...\n";
                $w->get( $imgurl, ":content_file" => $filename );
                ++$ndownloads; last if $ndownloads >= $max;
            }
        } else { warn "Couldn't find an image on $url\n"; }
    }

    last unless $nextlink && ($ndownloads < $max);
    my $nexturl = URI->new_abs( $nextlink->url, $currpage )->as_string;
    $w->get( $nexturl ); die "$nexturl failed!\n" unless $w->success;
}
#!/usr/bin/python
"""
getcams.py - Archiving Your Favorite Web Cams
Sean B. Palmer, <http://purl.org/net/sbp/>, 2003-07.
License: GPL 2; share and enjoy!
Usage:
python getcams.py [ <filename> ]
<filename> defaults to URIs.txt
"""
import urllib2, time
from urllib import quote
from email.Utils import parsedate
# # # # # # # # # # # # # # # # #
# Configurable stuff
#
# download how often, in seconds
seconds = 15
# what file we should write to
index = 'webcams.html'