Web Scraping Exercises for Beginners – Solutions with Explanations
Exercise 1: Basic HTML Page Scraping
from bs4 import BeautifulSoup
html_content = """
<html>
<head><title>Exercise 1</title></head>
<body>
<h1>Welcome to Web Scraping</h1>
<p>This is the first paragraph.</p>
<p>This is the second paragraph.</p>
<div>This is a div, not a paragraph.</div>
</body>
</html>
"""
soup = BeautifulSoup(html_content, 'html.parser')
title = soup.find('title').text
paragraphs = [p.text for p in soup.find_all('p')]
print("Title:", title)
print("Paragraphs:", paragraphs)
Explanation:
- BeautifulSoup parses the HTML text.
- .find('title') fetches the first <title> tag.
- .find_all('p') collects all <p> tags into a list.
- We only get paragraphs, not the <div> (a CSS-selector variant of the same extraction is sketched below).
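For comparison, here is a minimal sketch of the same extraction using CSS selectors instead of find()/find_all(); it reuses the soup object parsed above.

title_css = soup.select_one('title').get_text()                   # first matching element, like find()
paragraphs_css = [p.get_text() for p in soup.select('body > p')]  # only <p> tags directly under <body>
print("Title:", title_css)
print("Paragraphs:", paragraphs_css)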
Exercise 2: Extract Links from a Webpage
from bs4 import BeautifulSoup
import requests
def extract_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a') if a.get('href')]
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []
# Test
links = extract_links('https://httpbin.org/links/10')
print("Links found:", links)
Explanation:
- Uses requests to fetch a live webpage.
- .find_all('a') grabs all anchor tags.
- .get('href') extracts the URL inside <a href="...">.
- Returns only anchors that actually have an href attribute (see the urljoin sketch below).
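The href values returned this way can be relative paths. A minimal follow-up sketch, assuming you want absolute URLs, resolves each one against the page URL with urllib.parse.urljoin:

from urllib.parse import urljoin

page_url = 'https://httpbin.org/links/10'
absolute_links = [urljoin(page_url, href) for href in extract_links(page_url)]  # '/links/10/3' -> 'https://httpbin.org/links/10/3'
print("Absolute links:", absolute_links)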
Exercise 3: Scrape Data from a Table
from bs4 import BeautifulSoup
html_table = """
<table>
<tr><th>Name</th><th>Age</th><th>City</th></tr>
<tr><td>John</td><td>25</td><td>New York</td></tr>
<tr><td>Alice</td><td>30</td><td>London</td></tr>
<tr><td>Bob</td><td>35</td><td>Paris</td></tr>
</table>
"""
soup = BeautifulSoup(html_table, 'html.parser')
table_data = []
headers = [th.text for th in soup.find_all('th')]
for row in soup.find_all('tr')[1:]:  # Skip header row
    cells = row.find_all('td')
    if cells:
        row_data = {}
        for i, cell in enumerate(cells):
            row_data[headers[i]] = cell.text
        table_data.append(row_data)
print("Table data:", table_data)
Explanation:
- Extracts column headers (<th>).
- Loops through each row (<tr>), skipping the header.
- Matches each <td> cell with the corresponding header.
- Creates structured dictionaries → makes the table usable like a dataset (a CSV export sketch follows below).
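Because each row is already a dictionary keyed by the headers, the result drops straight into the standard library's csv.DictWriter. A minimal sketch (the people.csv filename is just an example):

import csv

with open('people.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=headers)  # headers extracted from the <th> cells above
    writer.writeheader()
    writer.writerows(table_data)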
Exercise 4: Scrape Product Information
from bs4 import BeautifulSoup
html_product = """
<div class="product">
<h2 class="product-name">Laptop</h2>
<p class="description">A powerful laptop for work and play</p>
<div class="price">$999.99</div>
<span class="rating">4.5 stars</span>
</div>
"""
soup = BeautifulSoup(html_product, 'html.parser')
product_info = {
    'name': soup.find('h2', class_='product-name').text if soup.find('h2', class_='product-name') else 'N/A',
    'description': soup.find('p', class_='description').text if soup.find('p', class_='description') else 'N/A',
    'price': soup.find('div', class_='price').text if soup.find('div', class_='price') else 'N/A',
    'rating': soup.find('span', class_='rating').text if soup.find('span', class_='rating') else 'N/A'
}
print("Product information:", product_info)
Explanation:
- Each product attribute is wrapped inside a specific class (e.g., "product-name").
- .find(tag, class_="...") locates the element.
- Uses conditional checks so the code won’t break if a field is missing (a small helper that removes the repetition is sketched below).
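The find-twice-or-'N/A' pattern above is safe but repetitive. One possible refactor, not part of the original solution, is a small helper that looks each element up only once:

def safe_text(soup, tag, class_name, default='N/A'):
    """Return the text of the first matching element, or a default if it is missing."""
    element = soup.find(tag, class_=class_name)
    return element.text if element else default

product_info = {
    'name': safe_text(soup, 'h2', 'product-name'),
    'description': safe_text(soup, 'p', 'description'),
    'price': safe_text(soup, 'div', 'price'),
    'rating': safe_text(soup, 'span', 'rating'),
}
print("Product information:", product_info)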
Exercise 5: Pagination Handling
from bs4 import BeautifulSoup
import requests
import time
def scrape_multiple_pages(base_url, pages):
    all_data = []
    for page in range(1, pages + 1):
        try:
            url = f"{base_url}?page={page}"
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = [p.text for p in soup.find_all('p')]
            all_data.extend(paragraphs)
            print(f"Scraped page {page}")
            time.sleep(1)  # avoid hammering server
        except requests.exceptions.RequestException as e:
            print(f"Error scraping page {page}: {e}")
    return all_data
Explanation:
- Loops through multiple pages by changing the ?page= parameter.
- Scrapes each page and adds the results to a single list.
- Adds time.sleep(1) → prevents server overload (polite scraping). A usage sketch follows below.
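A quick usage sketch; the URL below is a placeholder, assuming the target site paginates with a ?page=N query parameter:

# Hypothetical paginated site -- replace with a real URL that uses ?page=N
data = scrape_multiple_pages('https://example.com/articles', pages=3)
print(f"Collected {len(data)} paragraphs across 3 pages")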
Exercise 6: Extract Email Addresses
import re
import requests
def extract_emails(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        # Character class is [A-Za-z]; a '|' inside brackets would be matched literally
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        emails = re.findall(email_pattern, response.text)
        return emails
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []
Explanation:
- Uses a regex pattern to detect emails in the page text.
- Captures formats like abc@xyz.com.
- Useful for crawling directories, but must be used ethically (a deduplication sketch follows below).
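A short usage sketch; https://httpbin.org/html is only a convenient test page and may contain no email addresses at all. Deduplicating with a set is usually worthwhile, since the same address often appears several times on one page:

emails = extract_emails('https://httpbin.org/html')
unique_emails = sorted(set(emails))  # drop duplicates, keep a stable order
print("Unique emails found:", unique_emails)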
Exercise 7: Scrape Images
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time
def download_images(url, save_path='images'):
    try:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        images = soup.find_all('img')
        downloaded_images = []
        for img in images:
            img_url = img.get('src')
            if img_url:
                img_url = urljoin(url, img_url)
                img_name = os.path.basename(img_url)
                img_response = requests.get(img_url, stream=True)
                if img_response.status_code == 200:
                    with open(os.path.join(save_path, img_name), 'wb') as f:
                        for chunk in img_response.iter_content(1024):
                            f.write(chunk)
                    downloaded_images.append(img_name)
                    print(f"Downloaded: {img_name}")
                time.sleep(0.5)
        return downloaded_images
    except Exception as e:
        print(f"Error: {e}")
        return []
Explanation:
- Extracts all <img> tags.
- Converts relative src paths to absolute URLs (urljoin).
- Downloads each image → saves it locally in a folder.
- Adds a delay between downloads (to avoid being blocked). A usage sketch follows below.
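A quick usage sketch; any page URL works here, and a page with no <img> tags simply returns an empty list. The downloads folder name is arbitrary:

saved = download_images('https://example.com', save_path='downloads')
print(f"Saved {len(saved)} image(s) into the downloads/ folder")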
Exercise 8: Handle Dynamic Content
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
def scrape_dynamic_content(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        page_content = driver.page_source
        links = driver.find_elements(By.TAG_NAME, "a")
        link_urls = [link.get_attribute('href') for link in links if link.get_attribute('href')]
        return page_content, link_urls
    finally:
        driver.quit()  # always close the browser, even if the wait times out
Explanation:
- Uses Selenium for sites where content loads via JavaScript.
- Waits for elements with WebDriverWait.
- Simulates scrolling → triggers infinite scroll.
- Captures dynamic content beyond plain HTML. A usage sketch follows below.
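A usage sketch, assuming Chrome and a matching chromedriver are installed and reachable by Selenium; the URL is just a placeholder for a JavaScript-heavy page:

html, urls = scrape_dynamic_content('https://example.com')
print(f"Rendered page source: {len(html)} characters, {len(urls)} links found")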
Exercise 9: API Data Extraction
import requests
import json
def extract_from_api(api_url):
    try:
        response = requests.get(api_url)
        response.raise_for_status()
        data = response.json()
        processed_data = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    processed_item = {}
                    if 'title' in item:
                        processed_item['title'] = item['title']
                    if 'id' in item:
                        processed_item['id'] = item['id']
                    processed_data.append(processed_item)
        elif isinstance(data, dict):
            processed_data.append(data)
        return processed_data
    except Exception as e:
        print(f"Error: {e}")
        return []
Explanation:
- Instead of scraping HTML, pulls structured JSON data from an API endpoint.
- Parses with .json() → the data arrives already clean and structured.
- Faster and cleaner than scraping HTML. A usage sketch follows below.
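A usage sketch against a public test API (JSONPlaceholder); its /posts endpoint returns a JSON list of objects that include id and title fields, which is exactly the shape this function looks for:

posts = extract_from_api('https://jsonplaceholder.typicode.com/posts')
print(f"Fetched {len(posts)} items")
print("First item:", posts[0] if posts else None)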
Exercise 10: Scrape News Headlines
from bs4 import BeautifulSoup
import requests
def scrape_news_headlines(url, max_headlines=10):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        headlines = []
        selectors = ['h1', 'h2', 'h3', '.headline', '.title']
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text().strip()
                if text and len(text) > 10 and text not in headlines:
                    headlines.append(text)
                if len(headlines) >= max_headlines:
                    break
            if len(headlines) >= max_headlines:
                break
        return headlines
    except Exception as e:
        print(f"Error: {e}")
        return []
Explanation:
- Uses multiple CSS selectors (h1, .headline, etc.) to find headlines.
- Strips whitespace from the text and removes duplicates.
- Limits the output to max_headlines for clean results. A usage sketch follows below.
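A usage sketch; the BBC News front page is only an example target, and which selectors actually match will differ from site to site:

headlines = scrape_news_headlines('https://www.bbc.com/news', max_headlines=5)
for i, headline in enumerate(headlines, start=1):
    print(f"{i}. {headline}")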