Last modified: Nov 10, 2023 By Alexander Williams
Python: Extract Email Addresses from a Website
Table Of Contents
Using BeautifulSoup for HTML Parsing
# Import necessary libraries
from bs4 import BeautifulSoup
import requests
# URL of the web page to scrape
url = "https://example.com"

# Make a request to the URL. The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() stops us from
# parsing an HTTP error page (4xx/5xx) as if it were the real content.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all email addresses in the parsed HTML
mailto_prefix = 'mailto:'
email_addresses = []
for a in soup.find_all('a', href=True):
    href = a['href'].strip()
    # The scheme must be at the very start of the href (URI schemes are
    # case-insensitive). A plain substring test would also accept links that
    # merely contain "mailto:" somewhere, and slicing those yields garbage.
    if not href.lower().startswith(mailto_prefix):
        continue
    # Drop the scheme and any "?subject=...&body=..." header fields so that
    # only the address part remains.
    address = href[len(mailto_prefix):].partition('?')[0]
    if address:
        email_addresses.append(address)

# Print the extracted email addresses
print(email_addresses)
Output:
['info@example.com', 'support@example.com']
Using Scrapy for Web Scraping
# Install Scrapy using: pip install scrapy
# Create a Scrapy spider to extract email addresses
import scrapy
class EmailSpider(scrapy.Spider):
    """Spider that prints the ``mailto:`` link targets found in a page."""

    name = 'email_spider'
    start_urls = ['https://example.com']

    def parse(self, response):
        """Print the href of every anchor whose href starts with ``mailto:``.

        The values are printed exactly as they appear in the page, so each
        entry still carries its ``mailto:`` prefix.
        """
        # Select the href attribute of every mailto anchor using XPath
        mailto_hrefs = response.xpath('//a[starts-with(@href, "mailto:")]/@href')
        found = mailto_hrefs.extract()
        print(found)
# Run the spider
# scrapy runspider spider_name.py
Output:
['mailto:info@example.com', 'mailto:support@example.com']
Using Selenium for Dynamic Content
# Install Selenium using: pip install selenium
# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By

# URL of the web page to scrape
url = "https://example.com"

# Initialize WebDriver (make sure to have the appropriate driver installed)
driver = webdriver.Chrome()
try:
    driver.get(url)

    # Extract email addresses from dynamically loaded content.
    # find_elements(By.XPATH, ...) is used because the older
    # find_elements_by_xpath helper was removed in Selenium 4.3.
    email_elements = driver.find_elements(
        By.XPATH, '//a[starts-with(@href, "mailto:")]'
    )
    # The XPath guarantees each href starts with "mailto:", so stripping a
    # fixed-length prefix is safe here.
    email_addresses = [
        element.get_attribute("href")[len("mailto:"):]
        for element in email_elements
    ]

    # Print the extracted email addresses
    print(email_addresses)
finally:
    # Close the browser even if loading or scraping raised, so no orphaned
    # browser/driver process is left behind.
    driver.quit()
Output:
['info@example.com', 'support@example.com']