Commit aa29e4c1 authored by O'Reilly Media, Inc.'s avatar O'Reilly Media, Inc.

Initial commit

parents
9780596000806
\ No newline at end of file
This diff is collapsed.
## Example files for the title:
# Beginning Perl for Bioinformatics, by James Tisdall
[![Beginning Perl for Bioinformatics, by James Tisdall](http://akamaicovers.oreilly.com/images/9780596000806/cat.gif)](https://www.safaribooksonline.com/library/view/title/0596000804//)
The following applies to example files from material published by O’Reilly Media, Inc. Content from other publishers may include different rules of usage. Please refer to any additional usage rights explained in the actual example files or refer to the publisher’s website.
O'Reilly books are here to help you get your job done. In general, you may use the code in O'Reilly books in your programs and documentation. You do not need to contact us for permission unless you're reproducing a significant portion of the code. For example, writing a program that uses several chunks of code from our books does not require permission. Answering a question by citing our books and quoting example code does not require permission. On the other hand, selling or distributing a CD-ROM of examples from O'Reilly books does require permission. Incorporating a significant amount of example code from our books into your product's documentation does require permission.
We appreciate, but do not require, attribution. An attribution usually includes the title, author, publisher, and ISBN.
If you think your use of code examples falls outside fair use or the permission given here, feel free to contact us at <permissions@oreilly.com>.
Please note that the examples are not production code and have not been carefully testing. They are provided "as-is" and come with no warranty of any kind.
This diff is collapsed.
MNIDDKLEGLFLKCGGIDEMQSSRTMVVMGGVSGQSTVSGELQD
SVLQDRSMPHQEILAADEVLQESEMRQQDMISHDELMVHEETVKNDEEQMETHERLPQ
GLQYALNVPISVKQEITFTDVSEQLMRDKKQIR
REBASE version 104 bionet.104
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
REBASE, The Restriction Enzyme Database http://rebase.neb.com
Copyright (c) Dr. Richard J. Roberts, 2001. All rights reserved.
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Rich Roberts Mar 30 2001
AaaI (XmaIII) C^GGCCG
AacI (BamHI) GGATCC
AaeI (BamHI) GGATCC
AagI (ClaI) AT^CGAT
AaqI (ApaLI) GTGCAC
AarI CACCTGCNNNN^
AarI ^NNNNNNNNGCAGGTG
AatI (StuI) AGG^CCT
AatII GACGT^C
AauI (Bsp1407I) T^GTACA
AbaI (BclI) T^GATCA
AbeI (BbvCI) CC^TCAGC
AbeI (BbvCI) GC^TGAGG
AbrI (XhoI) C^TCGAG
AcaI (AsuII) TTCGAA
AcaII (BamHI) GGATCC
AcaIII (MstI) TGCGCA
AcaIV (HaeIII) GGCC
AccI GT^MKAC
AccII (FnuDII) CG^CG
AccIII (BspMII) T^CCGGA
Acc16I (MstI) TGC^GCA
Acc36I (BspMI) ACCTGCNNNN^
Acc36I (BspMI) ^NNNNNNNNGCAGGT
Acc38I (EcoRII) CCWGG
Acc65I (KpnI) G^GTACC
Acc113I (ScaI) AGT^ACT
AccB1I (HgiCI) G^GYRCC
AccB2I (HaeII) RGCGC^Y
AccB7I (PflMI) CCANNNN^NTGG
AccBSI (BsrBI) CCG^CTC
AccBSI (BsrBI) GAG^CGG
AccEBI (BamHI) G^GATCC
AceI (TseI) G^CWGC
AceII (NheI) GCTAG^C
AceIII CAGCTCNNNNNNN^
AceIII ^NNNNNNNNNNNGAGCTG
AciI C^CGC
AciI G^CGG
AclI AA^CGTT
AclNI (SpeI) A^CTAGT
AclWI (BinI) GGATCNNNN^
This diff is collapsed.
#!/usr/bin/perl
# Example 10-1 Extract annotation and sequence from GenBank file
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# declare and initialize variables
my @annotation = ( );
my $sequence = '';
my $filename = 'record.gb';
parse1(\@annotation, \$sequence, $filename);
# Print the annotation, and then
# print the DNA in new format just to check if we got it okay.
print @annotation;
print_sequence($sequence, 50);
exit;
################################################################################
# Subroutine
################################################################################
# parse1
#
# -parse annotation and sequence from GenBank record
sub parse1 {
my($annotation, $dna, $filename) = @_;
# $annotation-reference to array
# $dna -reference to scalar
# $filename -scalar
# declare and initialize variables
my $in_sequence = 0;
my @GenBankFile = ( );
# Get the GenBank data into an array from a file
@GenBankFile = get_file_data($filename);
# Extract all the sequence lines
foreach my $line (@GenBankFile) {
if( $line =~ /^\/\/\n/ ) { # If $line is end-of-record line //\n,
last; #break out of the foreach loop.
} elsif( $in_sequence) { # If we know we're in a sequence,
$$dna .= $line; # add the current line to $$dna.
} elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence,
$in_sequence = 1; # set the $in_sequence flag.
} else{ # Otherwise
push( @$annotation, $line); # add the current line to @annotation.
}
}
# remove whitespace and line numbers from DNA sequence
$$dna =~ s/[\s0-9]//g;
}
#!/usr/bin/perl
# Example 10-2 Extract the annotation and sequence sections from the first
# record of a GenBank library
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my $annotation = '';
my $dna = '';
my $record = '';
my $filename = 'record.gb';
my $save_input_separator = $/;
# Open GenBank library file
unless (open(GBFILE, $filename)) {
print "Cannot open GenBank file \"$filename\"\n\n";
exit;
}
# Set input separator to "//\n" and read in a record to a scalar
$/ = "//\n";
$record = <GBFILE>;
# reset input separator
$/ = $save_input_separator;
# Now separate the annotation from the sequence data
($annotation, $dna) = ($record =~ /^(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s);
# Print the two pieces, which should give us the same as the
# original GenBank file, minus the // at the end
print $annotation, $dna;
exit;
#!/usr/bin/perl -w
# Example 10-3 Parsing GenBank annotations using arrays
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my @genbank = ( );
my $locus = '';
my $accession = '';
my $organism = '';
# Get GenBank file data
@genbank = get_file_data('record.gb');
# Let's start with something simple. Let's get some of the identifying
# information, let's say the locus and accession number (here the same
# thing) and the definition and the organism.
for my $line (@genbank) {
if($line =~ /^LOCUS/) {
$line =~ s/^LOCUS\s*//;
$locus = $line;
}elsif($line =~ /^ACCESSION/) {
$line =~ s/^ACCESSION\s*//;
$accession = $line;
}elsif($line =~ /^ ORGANISM/) {
$line =~ s/^\s*ORGANISM\s*//;
$organism = $line;
}
}
print "*** LOCUS ***\n";
print $locus;
print "*** ACCESSION ***\n";
print $accession;
print "*** ORGANISM ***\n";
print $organism;
exit;
#!/usr/bin/perl -w
# Example 10-4 Parsing GenBank annotations using arrays, take 2
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my @genbank = ( );
my $locus = '';
my $accession = '';
my $organism = '';
my $definition = '';
my $flag = 0;
# Get GenBank file data
@genbank = get_file_data('record.gb');
# Let's start with something simple. Let's get some of the identifying
# information, let's say the locus and accession number (here the same
# thing) and the definition and the organism.
for my $line (@genbank) {
if($line =~ /^LOCUS/) {
$line =~ s/^LOCUS\s*//;
$locus = $line;
}elsif($line =~ /^DEFINITION/) {
$line =~ s/^DEFINITION\s*//;
$definition = $line;
$flag = 1;
}elsif($line =~ /^ACCESSION/) {
$line =~ s/^ACCESSION\s*//;
$accession = $line;
$flag = 0;
}elsif($flag) {
chomp($definition);
$definition .= $line;
}elsif($line =~ /^ ORGANISM/) {
$line =~ s/^\s*ORGANISM\s*//;
$organism = $line;
}
}
print "*** LOCUS ***\n";
print $locus;
print "*** DEFINITION ***\n";
print $definition;
print "*** ACCESSION ***\n";
print $accession;
print "*** ORGANISM ***\n";
print $organism;
exit;
#!/usr/bin/perl
# Example 10-5 - test program of GenBank library subroutines)
use strict;
use warnings;
# Don't use BeginPerlBioinfo
# Since all subroutines defined in this file
# use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my $fh; # variable to store filehandle
my $record;
my $dna;
my $annotation;
my $offset;
my $library = 'library.gb';
# Perform some standard subroutines for test
$fh = open_file($library);
$offset = tell($fh);
while( $record = get_next_record($fh) ) {
($annotation, $dna) = get_annotation_and_dna($record);
if( search_sequence($dna, 'AAA[CG].')) {
print "Sequence found in record at offset $offset\n";
}
if( search_annotation($annotation, 'homo sapiens')) {
print "Annotation found in record at offset $offset\n";
}
$offset = tell($fh);
}
exit;
################################################################################
# Subroutines
################################################################################
# open_file
#
# - given filename, set filehandle
sub open_file {
my($filename) = @_;
my $fh;
unless(open($fh, $filename)) {
print "Cannot open file $filename\n";
exit;
}
return $fh;
}
# get_next_record
#
# - given GenBank record, get annotation and DNA
sub get_next_record {
my($fh) = @_;
my($offset);
my($record) = '';
my($save_input_separator) = $/;
$/ = "//\n";
$record = <$fh>;
$/ = $save_input_separator;
return $record;
}
# get_annotation_and_dna
#
# - given filehandle to open GenBank library file, get next record
sub get_annotation_and_dna {
my($record) = @_;
my($annotation) = '';
my($dna) = '';
# Now separate the annotation from the sequence data
($annotation, $dna) = ($record =~ /^(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s);
# clean the sequence of any whitespace or / characters
# (the / has to be written \/ in the character class, because
# / is a metacharacter, so it must be "escaped" with \)
$dna =~ s/[\s\/]//g;
return($annotation, $dna)
}
# search_sequence
#
# - search sequence with regular expression
sub search_sequence {
my($sequence, $regularexpression) = @_;
my(@locations) = ( );
while( $sequence =~ /$regularexpression/ig ) {
push( @locations, pos );
}
return (@locations);
}
# search_annotation
#
# - search annotation with regular expression
sub search_annotation {
my($annotation, $regularexpression) = @_;
my(@locations) = ( );
# note the /s modifier-. matches any character including newline
while( $annotation =~ /$regularexpression/isg ) {
push( @locations, pos );
}
return (@locations);
}
#!/usr/bin/perl
# Example 10-6 - test program for parse_annotation subroutine
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my $fh;
my $record;
my $dna;
my $annotation;
my %fields;
my $library = 'library.gb';
# Open library and read a record
$fh = open_file($library);
$record = get_next_record($fh);
# Parse the sequence and annotation
($annotation, $dna) = get_annotation_and_dna($record);
# Extract the fields of the annotation
%fields = parse_annotation($annotation);
# Print the fields
foreach my $key (keys %fields) {
print "******** $key *********\n";
print $fields{$key};
}
exit;
################################################################################
# Subroutine
################################################################################
# parse_annotation
#
# given a GenBank annotation, returns a hash with
# keys: the field names
# values: the fields
sub parse_annotation {
my($annotation) = @_;
my(%results) = ( );
while( $annotation =~ /^[A-Z].*\n(^\s.*\n)*/gm ) {
my $value = $&;
(my $key = $value) =~ s/^([A-Z]+).*/$1/s;
$results{$key} = $value;
}
return %results;
}
#!/usr/bin/perl
# - main program to test parse_features
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my $fh;
my $record;
my $dna;
my $annotation;
my %fields;
my @features;
my $library = 'library.gb';
# Get the fields from the first GenBank record in a library
$fh = open_file($library);
$record = get_next_record($fh);
($annotation, $dna) = get_annotation_and_dna($record);
%fields = parse_annotation($annotation);
# Extract the features from the FEATURES table
@features = parse_features($fields{'FEATURES'});
# Print out the features
foreach my $feature (@features) {
# extract the name of the feature (or "feature key")
my($featurename) = ($feature =~ /^ {5}(\S+)/);
print "******** $featurename *********\n";
print $feature;
}
exit;
############################################################################
# Subroutine
############################################################################
# parse_features
#
# extract the features from the FEATURES field of a GenBank record
sub parse_features {
my($features) = @_; # entire FEATURES field in a scalar variable
# Declare and initialize variables
my(@features) = (); # used to store the individual features
# Extract the features
while( $features =~ /^ {5}\S.*\n(^ {21}\S.*\n)*/gm ) {
my $feature = $&;
push(@features, $feature);
}
return @features;
}
\ No newline at end of file
#!/usr/bin/perl
# Example 10-8 - make a DBM index of a GenBank library,
# and demonstrate its use interactively
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module
# Declare and initialize variables
my $fh;
my $record;
my $dna;
my $annotation;
my %fields;
my %dbm;
my $answer;
my $offset;
my $library = 'library.gb';
# open DBM file, creating if necessary
unless(dbmopen(%dbm, 'GB', 0644)) {
print "Cannot open DBM file GB with mode 0644\n";
exit;
}
# Parse GenBank library, saving accession number and offset in DBM file
$fh = open_file($library);
$offset = tell($fh);
while ( $record = get_next_record($fh) ) {
# Get accession field for this record.
($annotation, $dna) = get_annotation_and_dna($record);
%fields = parse_annotation($annotation);
my $accession = $fields{'ACCESSION'};
# extract just the accession number from the accession field
# -remove any trailing spaces
$accession =~ s/^ACCESSION\s*//;
$accession =~ s/\s*$//;
# store the key/value of accession/offset
$dbm{$accession} = $offset;
# get offset for next record
$offset = tell($fh);
}
# Now interactively query the DBM database with accession numbers
# to see associated records
print "Here are the available accession numbers:\n";
print join ( "\n", keys %dbm ), "\n";
print "Enter accession number (or quit): ";
while( $answer = <STDIN> ) {
chomp $answer;
if($answer =~ /^\s*q/) {
last;
}
$offset = $dbm{$answer};
if (defined $offset) {
seek($fh, $offset, 0);
$record = get_next_record($fh);
print $record;
}else{
print "Do not have an entry for accession number $answer\n";
}
print "\nEnter accession number (or quit): ";
}
dbmclose(%dbm);
close($fh);
exit;
#!/usr/bin/perl
# Example 10-8 - make a DBM index of a GenBank library,
# and demonstrate its use interactively
use strict;
use warnings;
use BeginPerlBioinfo; # see Chapter 6 about this module