Web Scraping Exercises for Beginners using Python

Web Scraping Exercises for Beginners – Solutions with Explanations


Exercise 1: Basic HTML Page Scraping

from bs4 import BeautifulSoup

html_content = """
<html>
<head><title>Exercise 1</title></head>
<body>
<h1>Welcome to Web Scraping</h1>
<p>This is the first paragraph.</p>
<p>This is the second paragraph.</p>
<div>This is a div, not a paragraph.</div>
</body>
</html>
"""

soup = BeautifulSoup(html_content, 'html.parser')
title = soup.find('title').text
paragraphs = [p.text for p in soup.find_all('p')]

print("Title:", title)
print("Paragraphs:", paragraphs)

Explanation:

  • BeautifulSoup parses the HTML text.
  • .find('title') fetches the first <title> tag.
  • .find_all('p') collects all <p> tags into a list.
  • We only get paragraphs, not the <div>.
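
One detail worth knowing: .find() returns None when a tag is absent, and calling .text on None raises an AttributeError. A minimal sketch, reusing the soup object from above (the <span> lookup is only there to demonstrate a missing tag):

missing = soup.find('span')  # no <span> in this HTML, so this is None
print("Span:", missing.text if missing else 'N/A')  # prints 'N/A' instead of crashing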

Exercise 2: Extract Links from a Webpage

from bs4 import BeautifulSoup
import requests

def extract_links(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a.get('href') for a in soup.find_all('a') if a.get('href')]
        return links
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

# Test
links = extract_links('https://httpbin.org/links/10')
print("Links found:", links)

Explanation:

  • Uses requests to fetch a live webpage.
  • .find_all('a') grabs all anchor tags.
  • .get('href') extracts the URL inside <a href="...">.
  • The if a.get('href') check skips anchors that lack an href, so only real links are returned.
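
Note that the href values may be relative paths (e.g. /links/10/0). If you need absolute URLs, urllib.parse.urljoin resolves them against the page URL; a small sketch, continuing from the links list returned above:

from urllib.parse import urljoin

base_url = 'https://httpbin.org/links/10'
absolute_links = [urljoin(base_url, link) for link in links]  # resolve relative hrefs
print("Absolute links:", absolute_links)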

Exercise 3: Scrape Data from a Table

from bs4 import BeautifulSoup

html_table = """
<table>
<tr><th>Name</th><th>Age</th><th>City</th></tr>
<tr><td>John</td><td>25</td><td>New York</td></tr>
<tr><td>Alice</td><td>30</td><td>London</td></tr>
<tr><td>Bob</td><td>35</td><td>Paris</td></tr>
</table>
"""

soup = BeautifulSoup(html_table, 'html.parser')
table_data = []

headers = [th.text for th in soup.find_all('th')]

for row in soup.find_all('tr')[1:]:  # Skip header row
    cells = row.find_all('td')
    if cells:
        row_data = {}
        for i, cell in enumerate(cells):
            row_data[headers[i]] = cell.text
        table_data.append(row_data)

print("Table data:", table_data)

Explanation:

  • Extracts column headers (<th>).

  • Loops through each row (<tr>), skipping the header.

  • Matches each <td> cell with the corresponding header.

  • Creates structured dictionaries → makes table usable like a dataset.
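
Because each row ends up as a dictionary keyed by column name, the result drops straight into a tool like pandas (an optional extra, not required by the exercise); a minimal sketch:

import pandas as pd

df = pd.DataFrame(table_data)        # columns: Name, Age, City
print(df)
print(df['Age'].astype(int).mean())  # e.g. average age of the three rows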


Exercise 4: Scrape Product Information

from bs4 import BeautifulSoup

html_product = """
<div class="product">
    <h2 class="product-name">Laptop</h2>
    <p class="description">A powerful laptop for work and play</p>
    <div class="price">$999.99</div>
    <span class="rating">4.5 stars</span>
</div>
"""

soup = BeautifulSoup(html_product, 'html.parser')

product_info = {
    'name': soup.find('h2', class_='product-name').text if soup.find('h2', class_='product-name') else 'N/A',
    'description': soup.find('p', class_='description').text if soup.find('p', class_='description') else 'N/A',
    'price': soup.find('div', class_='price').text if soup.find('div', class_='price') else 'N/A',
    'rating': soup.find('span', class_='rating').text if soup.find('span', class_='rating') else 'N/A'
}

print("Product information:", product_info)

Explanation:

  • Each product attribute is wrapped inside a specific class (e.g., "product-name").

  • .find(tag, class_="...") locates the element.

  • Uses conditional checks so code won’t break if a field is missing.
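
Calling .find() twice per field (once for the check, once for .text) works but gets repetitive. A small helper keeps the same missing-field behavior with a single lookup per field; safe_text is just a name chosen here for illustration:

def safe_text(parent, tag, class_name, default='N/A'):
    """Return the element's text, or a default if the element is missing."""
    element = parent.find(tag, class_=class_name)
    return element.text if element else default

product_info = {
    'name': safe_text(soup, 'h2', 'product-name'),
    'description': safe_text(soup, 'p', 'description'),
    'price': safe_text(soup, 'div', 'price'),
    'rating': safe_text(soup, 'span', 'rating'),
}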


Exercise 5: Pagination Handling

from bs4 import BeautifulSoup
import requests
import time

def scrape_multiple_pages(base_url, pages):
    all_data = []
    
    for page in range(1, pages + 1):
        try:
            url = f"{base_url}?page={page}"
            response = requests.get(url)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = [p.text for p in soup.find_all('p')]
            all_data.extend(paragraphs)
            
            print(f"Scraped page {page}")
            time.sleep(1)  # avoid hammering server
        except requests.exceptions.RequestException as e:
            print(f"Error scraping page {page}: {e}")
    
    return all_data

Explanation:

  • Loops through multiple pages by changing ?page= parameter.

  • Scrapes each page and adds results to a single list.

  • Adds time.sleep(1) → prevents server overload (polite scraping).
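
A quick usage sketch; the base URL below is a placeholder, and real sites differ in how they name the page parameter (?page=, ?p=, /page/2/, and so on):

# Hypothetical paginated site; replace with a real URL that accepts ?page=N
results = scrape_multiple_pages('https://example.com/articles', pages=3)
print(f"Collected {len(results)} paragraphs across 3 pages")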


Exercise 6: Extract Email Addresses

import re
import requests

def extract_emails(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'  # [A-Za-z]{2,} for the TLD (a literal '|' inside the class would be a bug)
        emails = re.findall(email_pattern, response.text)
        return emails
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return []

Explanation:

  • Uses regex pattern to detect emails in page text.

  • Captures formats like abc@xyz.com.

  • Useful for crawling directories, but must be used ethically.
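
re.findall() returns every match, including repeats, so deduplicating is usually the first post-processing step. A usage sketch with a placeholder URL:

emails = extract_emails('https://example.com/contact')  # placeholder URL
unique_emails = sorted(set(emails))  # drop duplicates, keep a stable order
print(f"Found {len(unique_emails)} unique addresses:", unique_emails)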


Exercise 7: Scrape Images

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time

def download_images(url, save_path='images'):
    try:
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        images = soup.find_all('img')
        
        downloaded_images = []
        
        for img in images:
            img_url = img.get('src')
            if img_url:
                img_url = urljoin(url, img_url)
                img_name = os.path.basename(img_url)
                
                img_response = requests.get(img_url, stream=True)
                if img_response.status_code == 200:
                    with open(os.path.join(save_path, img_name), 'wb') as f:
                        for chunk in img_response.iter_content(1024):
                            f.write(chunk)
                    downloaded_images.append(img_name)
                    print(f"Downloaded: {img_name}")
                time.sleep(0.5)
        
        return downloaded_images
    except Exception as e:
        print(f"Error: {e}")
        return []

Explanation:

  • Extracts all <img> tags.

  • Converts relative src paths to absolute URLs (urljoin).

  • Downloads each image → saves locally in a folder.

  • Adds delay between downloads (avoid being blocked).
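
A usage sketch; books.toscrape.com is a site built specifically for scraping practice, but any page containing <img> tags will do:

saved = download_images('https://books.toscrape.com/', save_path='book_covers')
print(f"Downloaded {len(saved)} images into book_covers/")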


Exercise 8: Handle Dynamic Content

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def scrape_dynamic_content(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        
        page_content = driver.page_source
        links = driver.find_elements(By.TAG_NAME, "a")
        link_urls = [link.get_attribute('href') for link in links if link.get_attribute('href')]
    finally:
        driver.quit()
    
    return page_content, link_urls

Explanation:

  • Uses Selenium for sites where content loads via JavaScript.

  • Waits for elements with WebDriverWait.

  • Simulates scrolling → triggers infinite scroll.

  • Captures dynamic content beyond plain HTML.
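
Running this needs Chrome installed (recent Selenium versions can fetch a matching chromedriver automatically via Selenium Manager). A usage sketch with a placeholder URL standing in for a JavaScript-heavy page:

html, urls = scrape_dynamic_content('https://example.com/infinite-scroll')  # placeholder URL
print(f"Rendered page: {len(html)} characters, {len(urls)} links found")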


Exercise 9: API Data Extraction

import requests

def extract_from_api(api_url):
    try:
        response = requests.get(api_url)
        response.raise_for_status()
        data = response.json()
        
        processed_data = []
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    processed_item = {}
                    if 'title' in item:
                        processed_item['title'] = item['title']
                    if 'id' in item:
                        processed_item['id'] = item['id']
                    processed_data.append(processed_item)
        elif isinstance(data, dict):
            processed_data.append(data)
        
        return processed_data
    except Exception as e:
        print(f"Error: {e}")
        return []

Explanation:

  • Instead of scraping HTML, pulls structured JSON data from an API endpoint.

  • response.json() parses the body straight into Python lists and dicts → already clean, no HTML parsing needed.

  • Faster and cleaner than scraping HTML.
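
A usage sketch against JSONPlaceholder, a free fake-data API whose /posts endpoint returns a list of objects with id and title fields, matching the keys the function looks for:

posts = extract_from_api('https://jsonplaceholder.typicode.com/posts')
print(f"Fetched {len(posts)} items")
print(posts[:3])  # first few {'title': ..., 'id': ...} dictionaries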


Exercise 10: Scrape News Headlines

from bs4 import BeautifulSoup
import requests

def scrape_news_headlines(url, max_headlines=10):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        headlines = []
        selectors = ['h1', 'h2', 'h3', '.headline', '.title']
        
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text().strip()
                if text and len(text) > 10 and text not in headlines:
                    headlines.append(text)
                    if len(headlines) >= max_headlines:
                        break
            if len(headlines) >= max_headlines:
                break
        
        return headlines
    except Exception as e:
        print(f"Error: {e}")
        return []

Explanation:

  • Uses multiple CSS selectors (h1, .headline, etc.) to find headlines.

  • Strips whitespace, filters out very short strings, and removes duplicates.

  • Limits to max_headlines for clean results.
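
A usage sketch; the BBC front page is used here only as an example of a headline-heavy site, and the class names a particular site uses for headlines (.headline, .title) will vary:

headlines = scrape_news_headlines('https://www.bbc.com/news', max_headlines=5)
for i, headline in enumerate(headlines, start=1):
    print(f"{i}. {headline}")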


