While twirling my luscious locks and thinking of ways to input data for my haveibeenpwned-based query tool, I thought to myself, “self, how cool would it be to just generate a list of hyper-linked email addresses from a public website?”
Having already put together a pretty basic PowerShell scraper, I decided this wouldn’t be terribly hard. The scraper I wrote downloads all the page content to a variable through the use of this handy command:
$Data = Invoke-WebRequest -Uri $Site
From there, I look through the document links (stored in $data.Links.Href), and then loop through those to download them. Since hyper-linked email addresses appear in the Links document property, this would be easy — I'd just have to drop in a regular expression to find email addresses and strip out the mailto: HREF:
[array]$EmailAddresses = @()
# RFC 5322-style address pattern. Single-quoted on purpose: in a double-quoted
# string PowerShell treats the backtick as an escape character and drops it
# from the character classes, and the original also contained stray spaces
# ("] | \\[", ") * )") and lost quoted-string delimiters that broke the match.
$regex = '(?:[a-z0-9!#$%&''*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&''*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
# ForEach-Object (rather than .Replace on the whole collection) so an empty
# match set doesn't throw a method-invocation error on $null.
$EmailAddresses += ($data.Links).Href | Where-Object { $_ -match $regex } | ForEach-Object { $_.Replace("mailto:", "") }
And voila! For my next trick, I’ll try searching the parsed HTML content for email addresses that aren’t hyperlinked. But for now, this can be pretty useful.
# Basic PowerShell scraper: downloads linked documents or scrapes hyper-linked
# email addresses from a single page at $Site.
param (
    # Optional credential for authenticated sites; see $UseCredentialAsPSCred.
    [System.Management.Automation.PSCredential]$Credential,
    # File extensions treated as downloadable documents by DocumentDownloadScrape.
    [array]$DocumentTypes = @("pdf","doc","docx","xls","xlsx","xlsm","xlsxm","ppt","pptx","jpg","gif","mp4","m4v","mp3","mov","avi","wmv","wma"),
    # Folder that downloads and the email list are written to (created if missing).
    [string]$OutputPath = "C:\Temp\",
    # BUGFIX: the old default 'DocumentScrape' was not in the ValidateSet and
    # was not handled by the switch below, so the script silently did nothing
    # unless the caller passed -ScrapeType explicitly.
    [ValidateSet('DocumentDownloadScrape','EmailAddressScrape')]$ScrapeType = 'DocumentDownloadScrape',
    # NOTE(review): despite the name, setting this switch makes the script send
    # a Basic Authorization *header* instead of passing the PSCredential to
    # Invoke-WebRequest — confirm the intended polarity with the author.
    [switch]$UseCredentialAsPSCred,
    # Page to scrape (required).
    [Parameter(Mandatory = $true)][string]$Site
)
If ($Credential)
{
    # Build an HTTP Basic Authorization header ("Basic base64(user:pass)")
    # from the supplied credential, for servers that need the raw header
    # rather than a PSCredential challenge/response.
    $Username = $Credential.UserName
    $Password = $Credential.GetNetworkCredential().Password
    $CredentialString = "$($Username):$($Password)"
    # UTF-8 instead of ASCII so non-ASCII passwords aren't silently mangled;
    # output is byte-identical for pure-ASCII credentials.
    $CredentialEncoded = [System.Convert]::ToBase64String([System.Text.Encoding]::UTF8.GetBytes($CredentialString))
    $BasicAuthValue = "Basic $($CredentialEncoded)"
    $Headers = @{ Authorization = $BasicAuthValue }
}
# Fetch the page once; all scraping below works from the parsed response.
# BUGFIX: the old catch dropped a bare string into the success pipeline;
# report the failure (with the reason) as a warning instead.
try { $data = Invoke-WebRequest -Uri $Site }
catch { Write-Warning "Unable to gather data from $($Site): $($_.Exception.Message)" }
# Make sure the destination folder exists, then trim any trailing backslash
# so later Join-Path calls build clean paths.
if (-not (Test-Path -Path $OutputPath))
{
    $null = New-Item -Path $OutputPath -ItemType Directory -Force
}
$OutputPath = $OutputPath.TrimEnd("\")
# Dispatch on the requested scrape mode; only runs if the page fetch succeeded.
if ($data)
{
    switch ($ScrapeType)
    {
        DocumentDownloadScrape {
            # Collect every link on the page and keep those ending in a
            # ".<ext>" for one of the requested document types.
            [array]$Links = @()
            $Links += ($data.Links).Href
            # BUGFIX: "\." anchors the match to an actual file extension;
            # without it a link ending in e.g. "notadoc" matched type "doc".
            $Filter = '(?i)\.(' + (($DocumentTypes | ForEach-Object { [regex]::Escape($_) }) -join "|") + ')$'
            [array]$FilesToDownload = $Links -match $Filter
            $i = 1
            $iTotal = $FilesToDownload.Count
            foreach ($File in $FilesToDownload)
            {
                # NOTE(review): assumes Href values are absolute URLs; relative
                # links would need resolving against $Site before downloading.
                $Filename = Split-Path $File -Leaf
                $OutputFile = Join-Path $OutputPath -ChildPath $Filename
                Write-Progress -Activity "Downloading $($File)." -PercentComplete (($i/$iTotal) * 100) -Id 1 -Status "File $($i) of $($iTotal)"
                # Splat so the auth arguments are only added when needed.
                $params = @{ }
                $params.Add('Uri', $File)
                $params.Add('OutFile', $OutputFile)
                If ($Credential)
                {
                    If ($UseCredentialAsPSCred -and $Headers)
                    {
                        $params.Add('Headers', $Headers)
                    }
                    Else
                    {
                        $params.Add('Credential', $Credential)
                    }
                }
                try { Invoke-WebRequest @params }
                catch { Write-Progress -Status "Error downloading $($File)." -Activity "Downloading $($File)." }
                $i++
            }
            Write-Progress -Activity "Finished." -Completed
        }
        EmailAddressScrape {
            # RFC 5322-style address pattern. Single-quoted on purpose: the
            # double-quoted original let PowerShell consume the backtick in
            # the character classes and contained stray spaces that broke
            # the pattern.
            $regex = '(?:[a-z0-9!#$%&''*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&''*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])'
            # BUGFIX: pipe through ForEach-Object rather than calling .Replace
            # on the whole collection, which threw when no links matched.
            [array]$EmailAddresses = ($data.Links).Href |
                Where-Object { $_ -match $regex } |
                ForEach-Object { $_.Replace("mailto:", "") }
            If ($EmailAddresses.Count -ge 1)
            {
                $OutputFile = Join-Path $OutputPath -ChildPath "EmailAddresses.txt"
                $EmailAddresses | Out-File -FilePath $OutputFile
                Write-Host -Fore Green "Finished scraping $($Site). Found $($EmailAddresses.Count) email addresses and saved to $($OutputFile.ToString())."
            }
            Else
            {
                Write-Host -Fore Yellow "Finished scraping $($Site) but didn't find any email addresses."
            }
        }
    }
}
Scrape early, scrape often!

