#!/usr/bin/perl -w
#
# Logs in to the Radio Times "My Diary" page, reads the date and time of the
# next diary entry, follows the link to the episode page and prints the
# episode title together with its air date.

use strict;
use warnings;

use WWW::Mechanize;
use HTML::TokeParser;

# Put the address you registered with the Radio Times site here.
my $email = 'your email address';
die "Must provide an email address" if $email eq '';

# Returns true when $tag (a start-tag token as returned by
# HTML::TokeParser::get_tag, or undef when no further tag was found) carries
# a class attribute equal to $class.
sub tag_has_class {
    my ($tag, $class) = @_;
    return 0 unless $tag and defined $tag->[1]{class};
    return $tag->[1]{class} eq $class;
}

# We create a WWW::Mechanize object and tell it the address of the site
# we'll be working from. The Radio Times' front page has an image link
# with an ALT text of "My Diary", so we can use that to get to the right
# section of the site. autocheck is switched on so that a failed request
# stops the script instead of letting it parse the wrong page.
my $agent = WWW::Mechanize->new( autocheck => 1 );
$agent->get("http://www.radiotimes.beeb.com/");

# follow_link() replaces the older follow() call; text_regex keeps the
# "link text matches this pattern" behaviour.
$agent->follow_link( text_regex => qr/My Diary/ );

# The returned page contains two forms - one to allow you to choose from a
# list box of program types, and then a login form for the diary
# function. We tell WWW::Mechanize to use the second form for input.
# (Something to remember here is that WWW::Mechanize's list of forms,
# unlike an array in Perl, is indexed starting at 1 rather than 0.
# Therefore, our index is '2'.)
$agent->form_number(2);

# Now we can fill in our email address for the 'email' field and click the
# submit button. Nothing too complicated here.
$agent->field("email", $email);
$agent->click( );

# WWW::Mechanize moves us on to our Diary page. This is the page we need to
# process to find the date details. We walk it with HTML::TokeParser. The
# important methods to note are get_tag, which will move the stream on to
# the next start of the tag given, and get_trimmed_text, which will take
# the text between the current tag and a given tag. For example, for the
# HTML code "<b>Bold text here</b>", get_trimmed_text("/b") called just
# after the <b> tag would return "Bold text here".
#
# HTML::TokeParser wants a reference to a scalar holding the document, so
# we copy the page out through the public content() accessor first.
my $html   = $agent->content;
my $stream = HTML::TokeParser->new(\$html);

my $date;    # will hold the current show's datestamp.

# Skip the markup that precedes the diary entry: one <input>, two <tr> and
# two <td> start tags.
# NOTE(review): the sample HTML that documented this layout was lost from
# this copy of the script; the tag sequence is taken from the calls below.
$stream->get_tag("input");
$stream->get_tag("tr");
$stream->get_tag("tr");
$stream->get_tag("td");
$stream->get_tag("td");

# Date of episode: a <td> cell with class "bluetext".
my $tag = $stream->get_tag("td");
if ( tag_has_class($tag, "bluetext") ) {
    $date = $stream->get_trimmed_text("/td");

    # The date contains '&nbsp;' (character 0xA0), which we'll translate
    # to a space.
    $date =~ s/\xa0/ /g;
}
die "Could not find the episode date on the diary page"
    unless defined $date;

# Skip one more <td> that sits between the date and the time.
$stream->get_tag("td");

# Time of episode: a <td class="bluetext"> cell whose text is inside <b>.
$tag = $stream->get_tag("td");
if ( tag_has_class($tag, "bluetext") ) {
    $stream->get_tag("b");

    # This concatenates the time of the showing to the date.
    $date .= ", from " . $stream->get_trimmed_text("/b");
}

# The next <a> tag links to the page giving episode information.
$tag = $stream->get_tag("a");
my $href = ( $tag and defined $tag->[1]{href} ) ? $tag->[1]{href} : '';

# Match the URL to find the page giving episode information. The capture
# is stored in a named variable straight away rather than relying on $1
# surviving the statements in between.
my ($episode_url) = $href =~ m!src=(http://.*?)'!;
die "Could not find the episode information link on the diary page"
    unless defined $episode_url;

# We have a scalar, $date, containing a string that looks something like
# "Thursday 23 January, from 6:45pm to 7:30pm.", and we have a URL, in
# $episode_url, that will tell us more about that episode. We tell
# WWW::Mechanize to go to the URL:
$agent->get($episode_url);

# The navigation we want to perform on this page is far less complex than
# on the last page, so we can avoid using a TokeParser for it - a regular
# expression should suffice. The HTML we want to parse looks something
# like this:
#
#   <br><b>Episode</b><br>  The Episode Title<br>
#
# We use a regex delimited with '!' in order to avoid having to escape the
# slashes present in the HTML, and store the text that follows some
# whitespace, up to the next tag, after the Episode header. The capture
# accepts spaces so that multi-word titles such as the example above match.
# NOTE(review): the HTML tags in this pattern were stripped from this copy
# of the script and have been reconstructed as <br>/<b>; confirm against
# the real episode page.
my ($episode) =
    $agent->content =~ m!<br><b>Episode</b><br>\s+(\S[^<]*?)\s*<br>!;
die "Could not find the episode title on the episode page"
    unless defined $episode;

# All that's left to do is print out what we've found:
print "The next Buffy episode ($episode) is on $date.\n";