"""Basic web scraper: fetches pages, extracts article data from websites,
stores results in a structured CSV format, and handles request errors."""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
from random import randint
# Function to fetch the HTML content of a web page
def fetch_html(url):
    """Download a web page and return its HTML text, or None on failure.

    Sends a browser-like User-Agent so sites that block default client
    strings still respond. Any request-level problem (connection error,
    timeout, non-2xx status) is reported and swallowed.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=request_headers, timeout=10)
        # Turn 4xx/5xx responses into exceptions handled below.
        resp.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print(f"Error fetching {url}: {exc}")
        return None
    return resp.text
# Function to parse HTML and extract data
def parse_html(html):
    """Parse an HTML document and extract article titles and contents.

    Args:
        html: HTML document as a string.

    Returns:
        A list of dicts with 'title' and 'content' keys — one per
        <article> element. Placeholder strings are used when an article
        lacks an <h2> or a <div class="content"> child.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = []
    for article in soup.find_all('article'):
        # Look each sub-element up once; the original called find()
        # twice per field (once for the test, once for the value).
        title_tag = article.find('h2')
        content_tag = article.find('div', class_='content')
        data.append({
            'title': title_tag.get_text(strip=True) if title_tag else 'No Title',
            'content': content_tag.get_text(strip=True) if content_tag else 'No Content',
        })
    return data
# Function to save data into a CSV file
def save_to_csv(data, filename='scraped_data.csv'):
    """Write a list of record dicts to a CSV file.

    Args:
        data: list of dicts; their (uniform) keys become the CSV columns.
        filename: destination path for the CSV file.
    """
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    # Bug fix: the original printed the literal text "(unknown)" instead
    # of interpolating the actual destination filename.
    print(f"Data saved to {filename}")
# Function to create the directory if it does not exist
def create_directory(directory):
    """Create *directory* (including parents) if it does not already exist.

    Uses makedirs(exist_ok=True) instead of the original check-then-create
    pair (os.path.exists + os.makedirs), which could raise FileExistsError
    if another process created the directory between the two calls.
    """
    os.makedirs(directory, exist_ok=True)
# Function to scrape multiple pages
def scrape_multiple_pages(base_url, num_pages):
    """Scrape pages 1..num_pages of *base_url* and return all records.

    Pages are fetched as '{base_url}?page=N'; a page that fails to fetch
    contributes nothing. A random 1-3 second pause separates requests to
    stay under rate limits.
    """
    collected = []
    for page in range(1, num_pages + 1):
        page_url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")
        page_html = fetch_html(page_url)
        if page_html:
            collected.extend(parse_html(page_html))
        time.sleep(randint(1, 3))
    return collected
# Main function to initiate the scraper
def main():
    """Drive the scraper: prepare output dir, scrape pages, persist results."""
    base_url = 'https://example.com/articles'  # Replace with actual base URL
    num_pages = 5  # Number of pages to scrape

    # Ensure the output directory exists before scraping.
    create_directory('scraped_data')

    print("Starting the scraper...")
    results = scrape_multiple_pages(base_url, num_pages)

    # Persist only when something was actually collected.
    if results:
        save_to_csv(results, 'scraped_data/articles.csv')
    else:
        print("No data scraped.")
# Entry point of the script: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()