email-opt.pl 5.8 KB
O'Reilly Media, Inc. committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
#
# Program to build a regex to match an internet email address,
# from Chapter 7 of _Mastering Regular Expressions_ (Friedl / O'Reilly)
# (http://www.oreilly.com/catalog/regex/)
#
# Optimized version.
#
# Copyright 1997 O'Reilly & Associates, Inc.
#



# Some things for avoiding backslashitis later on.
$esc        = '\\\\';               $Period      = '\.';
$space      = '\040';               $tab         = '\t';
$OpenBR     = '\[';                 $CloseBR     = '\]';
$OpenParen  = '\(';                 $CloseParen  = '\)';
$NonASCII   = '\x80-\xff';          $ctrl        = '\000-\037';
$CRlist     = '\n\015';  # note: this should really be only \015.

# Items 19, 20, 21
$qtext = qq/[^$esc$NonASCII$CRlist\"]/;               # for within "..."
$dtext = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/;  # for within [...]
$quoted_pair = qq< $esc [^$NonASCII] >; # an escaped character

##############################################################################
# Items 22 and 23, comment.
# Impossible to do properly with a regex, I make do by allowing at most one level of nesting.
$ctext   = qq< [^$esc$NonASCII$CRlist()] >;

# $Cnested matches one non-nested comment.
# It is unrolled, with normal of $ctext, special of $quoted_pair.
$Cnested = qq<
   $OpenParen                            #  (
      $ctext*                            #     normal*
      (?: $quoted_pair $ctext* )*        #     (special normal*)*
   $CloseParen                           #                       )
>;

# $comment allows one level of nested parentheses
# It is unrolled, with normal of $ctext, special of ($quoted_pair|$Cnested)
$comment = qq<
   $OpenParen                              #  (
       $ctext*                             #     normal*
       (?:                                 #       (
          (?: $quoted_pair | $Cnested )    #         special
           $ctext*                         #         normal*
       )*                                  #            )*
   $CloseParen                             #                )
>;

##############################################################################

# $X is optional whitespace/comments.
$X = qq<
   [$space$tab]*                    # Nab whitespace.
   (?: $comment [$space$tab]* )*    # If comment found, allow more spaces.
>;



# Item 10: atom
$atom_char   = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/;
$atom = qq<
  $atom_char+    # some number of atom characters...
  (?!$atom_char) # ..not followed by something that could be part of an atom
>;

# Item 11: doublequoted string, unrolled.
$quoted_str = qq<
    \"                                     # "
       $qtext *                            #   normal
       (?: $quoted_pair $qtext * )*        #   ( special normal* )*
    \"                                     #        "
>;

# Item 7: word is an atom or quoted string
$word = qq<
    (?:
       $atom                 # Atom
       |                       #  or
       $quoted_str           # Quoted string
     )
>;

# Item 12: domain-ref is just an atom
$domain_ref  = $atom;

# Item 13: domain-literal is like a quoted string, but [...] instead of  "..."
$domain_lit  = qq<
    $OpenBR                            # [
    (?: $dtext | $quoted_pair )*     #    stuff
    $CloseBR                           #           ]
>;

# Item 9: sub-domain is a domain-ref or domain-literal
$sub_domain  = qq<
  (?:
    $domain_ref
    |
    $domain_lit
   )
   $X # optional trailing comments
>;

# Item 6: domain is a list of subdomains separated by dots.
$domain = qq<
     $sub_domain
     (?:
        $Period $X $sub_domain
     )*
>;

# Item 8: a route. A bunch of "@ $domain" separated by commas, followed by a colon.
$route = qq<
    \@ $X $domain
    (?: , $X \@ $X $domain )*  # additional domains
    :
    $X # optional trailing comments
>;

# Item 6: local-part is a bunch of $word separated by periods
$local_part = qq<
    $word $X
    (?:
        $Period $X $word $X # additional words
    )*
>;

# Item 2: addr-spec is local@domain
$addr_spec  = qq<
  $local_part \@ $X $domain
>;

# Item 4: route-addr is <route? addr-spec>
$route_addr = qq[
    < $X                 # <
       (?: $route )?     #       optional route
       $addr_spec        #       address spec
    >                    #                 >
];


# Item 3: phrase........
$phrase_ctrl = '\000-\010\012-\037'; # like ctrl, but without tab

# Like atom-char, but without listing space, and uses phrase_ctrl.
# Since the class is negated, this matches the same as atom-char plus space and tab
$phrase_char =
   qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/;

# We've worked it so that $word, $comment, and $quoted_str to not consume trailing $X
# because we take care of it manually.
$phrase = qq<
   $word                        # leading word
   $phrase_char *               # "normal" atoms and/or spaces
   (?:
      (?: $comment | $quoted_str ) # "special" comment or quoted string
      $phrase_char *            #  more "normal"
   )*
>;

## Item #1: mailbox is an addr_spec or a phrase/route_addr
$mailbox = qq<
    $X                                  # optional leading comment
    (?:
            $addr_spec                  # address
            |                             #  or
            $phrase  $route_addr      # name and address
     )
>;



###########################################################################
# Here's a little snippet to test it.
# Addresses given on the commandline are described.
#

my $error = 0;
my $valid;
foreach $address (@ARGV) {
    $valid = $address =~ m/^$mailbox$/xo;
    printf "`$address' is syntactically %s.\n", $valid ? "valid" : "invalid";
    $error = 1 if not $valid;
}
exit $error;