Last modified: Nov 10, 2023 By Alexander Williams

Python: Extract Email Addresses from a Website

Table Of Contents

Using BeautifulSoup for HTML Parsing
Using Scrapy for Web Scraping
Using Selenium for Dynamic Content

Using BeautifulSoup for HTML Parsing


# Import necessary libraries
from bs4 import BeautifulSoup
import requests

# Scheme prefix that marks an email link, e.g. <a href="mailto:info@example.com">
MAILTO_PREFIX = "mailto:"

# URL of the web page to scrape
url = "https://example.com"

# Make a request to the URL.
# The timeout stops the script from hanging forever on an unresponsive server,
# and raise_for_status() avoids silently parsing an HTTP error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all email addresses in the parsed HTML
email_addresses = []
for a in soup.find_all('a', href=True):
    href = a['href'].strip()
    # Require the scheme at the START of the href (case-insensitively); a plain
    # substring test would also match URLs that merely contain "mailto:" and
    # the prefix slice below would then cut the wrong characters.
    if not href.lower().startswith(MAILTO_PREFIX):
        continue
    # Drop the scheme and any "?subject=...&body=..." query part.
    address = href[len(MAILTO_PREFIX):].split('?', 1)[0]
    if address:
        email_addresses.append(address)

# Print the extracted email addresses
print(email_addresses)

Output:


['info@example.com', 'support@example.com']

Using Scrapy for Web Scraping


# Install Scrapy using: pip install scrapy

# Create a Scrapy spider to extract email addresses
import scrapy

class EmailSpider(scrapy.Spider):
    """Spider that prints the ``mailto:`` link targets of the pages it parses."""

    name = 'email_spider'
    start_urls = ['https://example.com']

    def parse(self, response):
        """Print the href of every anchor whose href starts with ``mailto:``.

        The hrefs are printed exactly as found in the page, so every entry
        still carries its ``mailto:`` scheme prefix.
        """
        # Select the href attribute of each matching <a> element via XPath
        mailto_links = response.xpath('//a[starts-with(@href, "mailto:")]/@href')
        hrefs = mailto_links.getall()
        print(hrefs)

# Run the spider
# scrapy runspider spider_name.py

Output:


['mailto:info@example.com', 'mailto:support@example.com']

Using Selenium for Dynamic Content


# Install Selenium using: pip install selenium

# Import necessary libraries
from selenium import webdriver
from selenium.webdriver.common.by import By

# Scheme prefix that marks an email link
MAILTO_PREFIX = "mailto:"

# URL of the web page to scrape
url = "https://example.com"

# Initialize WebDriver (make sure to have the appropriate driver installed)
driver = webdriver.Chrome()
try:
    driver.get(url)

    # Extract email addresses from dynamically loaded content.
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3.
    email_elements = driver.find_elements(By.XPATH, '//a[starts-with(@href, "mailto:")]')
    # The XPath guarantees each href starts with the prefix, so slicing it off
    # is safe; also drop any "?subject=..." query part.
    email_addresses = [
        element.get_attribute("href")[len(MAILTO_PREFIX):].split('?', 1)[0]
        for element in email_elements
    ]
finally:
    # Close the browser even if loading or scraping fails, so no browser
    # process is left running.
    driver.quit()

# Print the extracted email addresses
print(email_addresses)

Output:


['info@example.com', 'support@example.com']

Python: Extract Email Addresses from a Website

Using BeautifulSoup for HTML Parsing

Using Scrapy for Web Scraping

Using Selenium for Dynamic Content

Related Tutorials:

Recent Tutorials:

Privacy Preferences