1. Install dependencies
pip install pandas requests beautifulsoup4 lxml

2. Scraping HTML tables with read_html()
If your target data is in an HTML table, pandas can scrape it in one line:
import pandas as pd

# pandas can scrape HTML tables directly: read_html() returns a list with
# one DataFrame per <table> element found on the page.
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
tables = pd.read_html(url)

# Get the first table
df = tables[0]
print(f"Found {len(tables)} tables")
print(df.head(10))

# Clean and save
# NOTE: assumes the table was parsed with multi-level column headers —
# droplevel(0) raises if the columns are a plain (single-level) Index.
df.columns = df.columns.droplevel(0)  # flatten multi-index
df.to_csv("gdp_data.csv", index=False)

3. BeautifulSoup to DataFrame
For non-table data, use BeautifulSoup to extract elements into a list of dicts, then create a DataFrame:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://books.toscrape.com/"
# timeout avoids hanging forever on a stalled connection;
# raise_for_status() fails fast instead of parsing an error page.
response = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, "lxml")

# Extract each product card into a dict — one dict per row.
books = []
for article in soup.select("article.product_pod"):
    books.append({
        "title": article.select_one("h3 a")["title"],
        "price": article.select_one(".price_color").text.strip(),
        # rating is encoded as a CSS class, e.g. <p class="star-rating Three">
        "rating": article.select_one("p.star-rating")["class"][1],
        "in_stock": "In stock" in article.select_one(
            ".availability"
        ).text
    })

# Straight into a DataFrame
df = pd.DataFrame(books)
print(df)

4. Cleaning scraped data
Raw scraped data is always messy. Here are the essential cleaning steps:
import pandas as pd

# Load raw scraped data (scraped_data: list of dicts from the scraping step)
df = pd.DataFrame(scraped_data)

# 1. Remove duplicates
df = df.drop_duplicates(subset=["title"])

# 2. Clean price column: "$29.99" -> 29.99
# regex=False does a literal replace — without it, "$" would be treated
# as a regex anchor on older pandas and match nothing.
df["price"] = (
    df["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# 3. Clean whitespace
df["title"] = df["title"].str.strip()

# 4. Handle missing values
df["rating"] = df["rating"].fillna(0).astype(float)

# 5. Add computed columns — bucket prices into labeled tiers
df["price_tier"] = pd.cut(
    df["price"],
    bins=[0, 10, 25, 50, 100, float("inf")],
    labels=["Budget", "Low", "Mid", "High", "Premium"]
)

print(df.info())
print(df.describe())

5. Analyzing scraped data
Once cleaned, use pandas aggregation and groupby to find insights:
import pandas as pd

# Analyze scraped product data (df: cleaned DataFrame from the previous step)
print("=== Price Analysis ===")
print(f"Mean price: ${df['price'].mean():.2f}")
print(f"Median price: ${df['price'].median():.2f}")
print(f"Std dev: ${df['price'].std():.2f}")

# Group by category: named aggregations give readable column names
category_stats = df.groupby("category").agg(
    count=("title", "count"),
    avg_price=("price", "mean"),
    avg_rating=("rating", "mean")
).sort_values("count", ascending=False)

print("\n=== Top Categories ===")
print(category_stats.head(10))

# Find best value items (high rating, low price)
df["value_score"] = df["rating"] / df["price"]
best_value = df.nlargest(10, "value_score")[
    ["title", "price", "rating", "value_score"]
]
print("\n=== Best Value Items ===")
print(best_value)

# Export analysis in three formats
df.to_csv("cleaned_products.csv", index=False)
df.to_json("cleaned_products.json", orient="records", indent=2)
df.to_excel("cleaned_products.xlsx", index=False)

6. Handling pagination
Most sites paginate their data. Loop through pages and collect everything into a single DataFrame:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


def scrape_all_pages(base_url, max_pages=50):
    """Scrape up to max_pages of a paginated listing into one DataFrame.

    Stops early on a non-200 response or an empty page. Individual page
    errors are logged and skipped so one bad page doesn't lose the rest.
    """
    all_items = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        try:
            resp = requests.get(url, headers={
                "User-Agent": "Mozilla/5.0"
            }, timeout=10)
            if resp.status_code != 200:
                print(f"Stopped at page {page}: HTTP {resp.status_code}")
                break
            soup = BeautifulSoup(resp.text, "lxml")
            items = soup.select(".product-card")
            if not items:
                print(f"No items on page {page}, stopping")
                break
            for item in items:
                all_items.append({
                    "name": item.select_one(".name").text.strip(),
                    "price": item.select_one(".price").text.strip(),
                })
            print(f"Page {page}: {len(items)} items")
            time.sleep(1.5)  # polite delay
        except Exception as e:
            print(f"Error on page {page}: {e}")
            # Stay polite even after an error — without this, a run of
            # failures would hit the server in a tight loop.
            time.sleep(1.5)
            continue
    return pd.DataFrame(all_items)


df = scrape_all_pages("https://example.com/products")
print(f"Total: {len(df)} items scraped")
df.to_csv("all_products.csv", index=False)

7. JavaScript pages with SnapRender
Many modern sites render data with JavaScript. requests + BeautifulSoup cannot execute JS. Use SnapRender to extract structured data from JS-rendered pages, then load it into pandas:
import requests
import pandas as pd

API_KEY = "sr_live_YOUR_KEY"


def scrape_with_snaprender(url):
    """Scrape a JS-rendered page via SnapRender.

    Sends the target URL plus CSS selectors to the extract endpoint and
    returns the "data" payload: a dict of parallel lists, one per selector.
    """
    resp = requests.post(
        "https://api.snaprender.dev/v1/extract",
        headers={
            "x-api-key": API_KEY,
            "Content-Type": "application/json"
        },
        json={
            "url": url,
            "selectors": {
                "names": "h2.product-name",
                "prices": ".price-current",
                "ratings": ".star-rating",
                "categories": ".breadcrumb li:last-child"
            }
        },
        timeout=30
    )
    resp.raise_for_status()  # fail fast on auth/quota/server errors
    return resp.json()["data"]


# Scrape multiple pages
urls = [f"https://example.com/products?page={i}" for i in range(1, 11)]
all_data = []
for url in urls:
    data = scrape_with_snaprender(url)
    # Zip the parallel arrays into rows
    for name, price, rating, cat in zip(
        data["names"], data["prices"],
        data["ratings"], data["categories"]
    ):
        all_data.append({
            "name": name, "price": price,
            "rating": rating, "category": cat
        })

# Into pandas for cleaning + analysis.
# regex=False is required: with the pre-pandas-2.0 default (regex=True),
# "$" is an end-of-string anchor, the replace is a no-op, and astype(float)
# raises on strings like "$29.99".
df = pd.DataFrame(all_data)
df["price"] = df["price"].str.replace("$", "", regex=False).astype(float)
df["rating"] = df["rating"].astype(float)
print(f"Scraped {len(df)} products")
print(df.describe())

Scrape any page into a DataFrame
SnapRender handles JavaScript rendering and anti-bot bypass. Extract structured data with CSS selectors and pipe it straight into pandas.
Get Your API Key — Free

Frequently asked questions
pandas has a built-in read_html() function that extracts HTML tables from a URL. However, it only works on static pages with <table> elements. For JavaScript-rendered pages or non-table data, you need requests + BeautifulSoup or the SnapRender API to fetch the data first, then load it into a DataFrame.
Start with dropna() to remove empty rows, str.strip() to clean whitespace, astype() to fix data types, and str.replace() for formatting (like removing $ from prices). Use apply() with custom functions for complex transformations. Always validate with df.info() and df.describe() after cleaning.
Scrape each page into a list of dictionaries, then create a single DataFrame with pd.DataFrame(all_results). Alternatively, create a DataFrame per page and concatenate them with pd.concat(). The latter uses more memory but is easier to debug.
pandas works well for datasets up to a few million rows. For larger datasets, consider chunked processing with chunksize parameter, or switch to Polars (Rust-based, faster). For the scraping step itself, use async libraries (aiohttp) or the SnapRender API for parallelism.
Yes. Scrape your data into a list of dictionaries, then call pd.DataFrame(data). Each dictionary becomes a row, and keys become column names. This is the cleanest pattern for web scraping with pandas.