##############################################################################
##
## Get-PageUrls
##
## From Windows PowerShell Cookbook (O'Reilly)
## by Lee Holmes (http://www.leeholmes.com/guide)
##############################################################################

<#

.SYNOPSIS

Parse all of the URLs out of a given file.

.EXAMPLE

Get-PageUrls microsoft.html http://www.microsoft.com
Gets all of the URLs from HTML stored in microsoft.html, and converts relative
URLs to the domain of http://www.microsoft.com

.EXAMPLE

Get-PageUrls microsoft.html http://www.microsoft.com 'aspx$'
Gets all of the URLs from HTML stored in microsoft.html, converts relative
URLs to the domain of http://www.microsoft.com, and returns only URLs that end
in 'aspx'.

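.EXAMPLE

Get-PageUrls microsoft.html http://www.microsoft.com -Images
Gets all of the image URLs (IMG SRC attributes) from HTML stored in
microsoft.html, and converts relative URLs to the domain of
http://www.microsoft.com
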
#>

param(
    ## The filename to parse
    [Parameter(Mandatory = $true)]
    [string] $Path,

    ## The URL from which you downloaded the page.
    ## For example, http://www.microsoft.com
    [Parameter(Mandatory = $true)]
    [string] $BaseUrl,

    ## Return image URLs (IMG SRC attributes) rather than anchor HREFs
    [switch] $Images,
    
    ## The Regular Expression pattern with which to filter
    ## the returned URLs
    [string] $Pattern = ".*"
)

Set-StrictMode -Version Latest

## Load the System.Web assembly so that we can decode HTML entities in URLs
Add-Type -Assembly System.Web

## Define the regular expression that will parse a URL
## out of an anchor tag.
$regex = "<\s*a\s*[^>]*?href\s*=\s*[`"']*([^`"'>]+)[^>]*?>"
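## For example, given '<a href="downloads/file.zip">', this pattern
## captures 'downloads/file.zip' in group 1.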
if($Images)
{
    $regex = "<\s*img\s*[^>]*?src\s*=\s*[`"']*([^`"'>]+)[^>]*?>"
}

## Parse the file for links
function Main
{
    ## Do some minimal source URL fixups by switching backslashes to
    ## forward slashes
    $baseUrl = $baseUrl.Replace("\", "/")

    if($baseUrl.IndexOf("://") -lt 0)
    {
        throw "Please specify a base URL in the form of " +
            "http://server/path_to_file/file.html"
    }

    ## Determine the server from which the file originated.  This will
    ## help us resolve links such as "/somefile.zip"
    $baseUrl = $baseUrl.Substring(0, $baseUrl.LastIndexOf("/") + 1)
    $baseSlash = $baseUrl.IndexOf("/", $baseUrl.IndexOf("://") + 3)

    if($baseSlash -ge 0)
    {
        $domain = $baseUrl.Substring(0, $baseSlash)
    }
    else
    {
        $domain = $baseUrl
    }

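    ## For example, a base URL of http://www.example.com/downloads/page.html
    ## leaves $baseUrl as http://www.example.com/downloads/ and $domain as
    ## http://www.example.com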

    ## Put all of the file content into a big string, and
    ## get the regular expression matches
    $content = [String]::Join(' ', (Get-Content $path))
    $contentMatches = @(GetMatches $content $regex)

    foreach($contentMatch in $contentMatches)
    {
        if(-not ($contentMatch -match $pattern)) { continue }
        if($contentMatch -match "javascript:") { continue }

        $contentMatch = $contentMatch.Replace("\", "/")

        ## Hrefs may look like:
        ## ./file
        ## file
        ## ../../../file
        ## /file
        ## url
        ## We'll keep all of the relative paths, as they will resolve.
        ## We only need to resolve the ones pointing to the root.
        if($contentMatch.IndexOf("://") -gt 0)
        {
            $url = $contentMatch
        }
        elseif($contentMatch[0] -eq "/")
        {
            $url = "$domain$contentMatch"
        }
        else
        {
            $url = "$baseUrl$contentMatch"
            $url = $url.Replace("/./", "/")
        }
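        ## For example, with $domain http://www.example.com and $baseUrl
        ## http://www.example.com/downloads/, the href '/search' resolves to
        ## http://www.example.com/search, while 'file.zip' resolves to
        ## http://www.example.com/downloads/file.zip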

        ## Return the URL, after first decoding any HTML entities
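        ## (for example, '&amp;' in a query string decodes to '&')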
        [System.Web.HttpUtility]::HtmlDecode($url)
    }
}

function GetMatches([string] $content, [string] $regex)
{
    $returnMatches = new-object System.Collections.ArrayList

    ## Match the regular expression against the content, and
    ## add all trimmed matches to our return list
    $resultingMatches = [Regex]::Matches($content, $regex, "IgnoreCase")
    foreach($match in $resultingMatches)
    {
        $cleanedMatch = $match.Groups[1].Value.Trim()
        [void] $returnMatches.Add($cleanedMatch)
    }

    $returnMatches
}
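## For example, GetMatches '<a href="a.html">x</a>' $regex returns 'a.html'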

## Run the Main function in the current scope
. Main