What this builds
A Python script that takes a sitemap URL, discovers all pages, extracts SEO-critical data from each one, and flags common issues like missing titles, thin content, and canonical mismatches.
Parse sitemap.xml
Handle both regular sitemaps and sitemap indexes. Extract all URLs recursively.
Render & extract
Use SnapRender /extract to pull title, H1, meta description, canonical, and word count from each page.
Export to CSV
Save all data to a CSV file for analysis in Google Sheets, Excel, or pandas.
Flag SEO issues
Automatically detect missing titles, thin content, canonical mismatches, and other problems.
Step 1: Parse the sitemap
Fetch and parse sitemap.xml using Python's built-in XML parser. This function handles both regular sitemaps and sitemap indexes (which point to other sitemaps).
#E8A0BF">import requests
#E8A0BF">import xml.etree.ElementTree #E8A0BF">as ET
#E8A0BF">def get_urls_from_sitemap(sitemap_url):
#A8D4A0">""#A8D4A0">"Parse sitemap.xml #E8A0BF">and #E8A0BF">return list of URLs."#A8D4A0">""
resp = requests.#87CEEB">get(sitemap_url)
root = ET.#87CEEB">fromstring(resp.#87CEEB">content)
# Handle namespace
ns = {#A8D4A0">'sm': #A8D4A0">'http://www.sitemaps.org/schemas/sitemap/0.9'}
urls = []
# Check #E8A0BF">if this is a sitemap index
sitemaps = root.#87CEEB">findall(#A8D4A0">'sm:sitemap', ns)
#E8A0BF">if sitemaps:
# It#A8D4A0">'s a sitemap index — recurse into each child
#E8A0BF">for sitemap #E8A0BF">in sitemaps:
loc = sitemap.#87CEEB">find('sm:loc#A8D4A0">', ns)
#E8A0BF">if loc is #E8A0BF">not #E8A0BF">None:
urls.extend(get_urls_from_sitemap(loc.#87CEEB">text))
#E8A0BF">else:
# It's a regular sitemap
#E8A0BF">for url_elem #E8A0BF">in root.#87CEEB">findall(#A8D4A0">'sm:url', ns):
loc = url_elem.#87CEEB">find(#A8D4A0">'sm:loc', ns)
#E8A0BF">if loc is #E8A0BF">not #E8A0BF">None:
urls.#87CEEB">append(loc.#87CEEB">text)
#E8A0BF">return urls
# Example
urls = get_urls_from_sitemap(#A8D4A0">"https://example.com/sitemap.xml")
#E8A0BF">print(#A8D4A0">"Found " + str(#E8A0BF">len(urls)) + #A8D4A0">" URLs")Step 2: Extract page data
For each URL, use SnapRender's /extract endpoint to pull the title, H1, meta description, canonical URL, and body text (for word count). The extract_attributes parameter gets attribute values (like content) instead of text content.
API_KEY = #A8D4A0">"sr_live_YOUR_KEY"
#E8A0BF">def extract_page_data(url):
#A8D4A0">""#A8D4A0">"Render page #E8A0BF">and extract SEO-relevant data."#A8D4A0">""
resp = requests.#87CEEB">post(
#A8D4A0">"https://api.snaprender.dev/v1/extract",
headers={#A8D4A0">"x-api-key": API_KEY},
json={
#A8D4A0">"url": url,
#A8D4A0">"selectors": {
#A8D4A0">"title": #A8D4A0">"title",
#A8D4A0">"h1": #A8D4A0">"h1",
#A8D4A0">"meta_desc": #A8D4A0">'meta[name=#A8D4A0">"description"]',
#A8D4A0">"canonical": #A8D4A0">'link[rel=#A8D4A0">"canonical"]',
#A8D4A0">"word_count": #A8D4A0">"body",
},
#A8D4A0">"extract_attributes": {
#A8D4A0">"meta_desc": #A8D4A0">"content",
#A8D4A0">"canonical": #A8D4A0">"href",
}
}
)
#E8A0BF">if resp.#87CEEB">status_code != 200:
#E8A0BF">return {#A8D4A0">"url": url, #A8D4A0">"error": resp.#87CEEB">status_code}
data = resp.#87CEEB">json().#87CEEB">get(#A8D4A0">"data", {})
body_text = data.#87CEEB">get(#A8D4A0">"word_count", #A8D4A0">"")
word_count = #E8A0BF">len(body_text.split()) #E8A0BF">if body_text #E8A0BF">else 0
#E8A0BF">return {
#A8D4A0">"url": url,
#A8D4A0">"title": data.#87CEEB">get(#A8D4A0">"title", #A8D4A0">""),
#A8D4A0">"h1": data.#87CEEB">get(#A8D4A0">"h1", #A8D4A0">""),
#A8D4A0">"meta_description": data.#87CEEB">get(#A8D4A0">"meta_desc", #A8D4A0">""),
#A8D4A0">"canonical": data.#87CEEB">get(#A8D4A0">"canonical", #A8D4A0">""),
#A8D4A0">"word_count": word_count,
}Step 3: Run the full audit
Loop through all URLs with rate limiting, extract data, and save to CSV.
#E8A0BF">import csv
#E8A0BF">import time
#E8A0BF">def audit_site(sitemap_url, output_file=#A8D4A0">"audit.csv"):
#A8D4A0">""#A8D4A0">"Full site audit: parse sitemap, extract data, save to CSV."#A8D4A0">""
#E8A0BF">print(#A8D4A0">"Fetching sitemap...")
urls = get_urls_from_sitemap(sitemap_url)
#E8A0BF">print(#A8D4A0">"Found " + str(#E8A0BF">len(urls)) + #A8D4A0">" URLs")
results = []
#E8A0BF">for i, url #E8A0BF">in enumerate(urls):
#E8A0BF">print(#A8D4A0">" [" + str(i+1) + #A8D4A0">"/" + str(#E8A0BF">len(urls)) + #A8D4A0">"] " + url)
data = extract_page_data(url)
results.#87CEEB">append(data)
# Rate limit: 1 request per second
time.#87CEEB">sleep(1)
# Write to CSV
#E8A0BF">if results:
#E8A0BF">with #E8A0BF">open(output_file, #A8D4A0">"w", newline=#A8D4A0">"") #E8A0BF">as f:
writer = csv.#87CEEB">DictWriter(f, fieldnames=results[0].keys())
writer.#87CEEB">writeheader()
#E8A0BF">for row #E8A0BF">in results:
writer.#87CEEB">writerow(row)
#E8A0BF">print(#A8D4A0">"Audit complete. Saved to " + output_file)
#E8A0BF">return results
# Run the audit
audit_site(#A8D4A0">"https://example.com/sitemap.xml")Step 4: Flag SEO issues
Automatically scan the results for common SEO problems.
def find_seo_issues(results):
    """Flag common SEO problems in the audit results.

    Takes the list of page dicts produced by audit_site() and returns a
    list of {"url", "issue"} dicts, one entry per detected problem.
    """
    issues = []
    for page in results:
        url = page.get("url", "")
        # Pages that failed to render carry an "error" key and none of the
        # SEO fields — report the fetch failure and skip the other checks
        # so they aren't falsely flagged as "missing title", "thin content".
        if "error" in page:
            issues.append({"url": url, "issue": f"Fetch error ({page['error']})"})
            continue
        # Missing title
        title = page.get("title", "")
        if not title:
            issues.append({"url": url, "issue": "Missing title tag"})
        # Title too long — search engines truncate around 60 characters.
        if len(title) > 60:
            issues.append({
                "url": url,
                "issue": f"Title too long ({len(title)} chars)"
            })
        # Missing meta description
        if not page.get("meta_description"):
            issues.append({"url": url, "issue": "Missing meta description"})
        # Missing H1
        if not page.get("h1"):
            issues.append({"url": url, "issue": "Missing H1 tag"})
        # Thin content — under 300 words rarely ranks for anything.
        wc = page.get("word_count", 0)
        if wc < 300:
            issues.append({
                "url": url,
                "issue": f"Thin content ({wc} words)"
            })
        # Canonical mismatch — exact string compare; a trailing-slash or
        # scheme difference counts as a mismatch on purpose.
        canonical = page.get("canonical", "")
        if canonical and canonical != url:
            issues.append({
                "url": url,
                "issue": f"Canonical mismatch: {canonical}"
            })
    return issues
Audit any site in minutes
Get your API key in 30 seconds. 100 free requests/month — enough to audit a 100-page site.
Get Your API Key
Frequently asked questions
A sitemap.xml is an XML file that lists all the important URLs on a website. It helps search engine crawlers discover and index pages. Most sites publish one at /sitemap.xml. It's also a goldmine for scrapers — it gives you a complete list of URLs to scrape without crawling.
Yes, but you'll need to crawl the site by following links from the homepage. This is slower and less complete than using a sitemap. SnapRender's /scrape endpoint with format: "markdown" makes it easy to extract links from rendered pages for crawling.
Large sites use a sitemap index file that points to multiple sitemap files. The index uses <sitemapindex> and <sitemap> tags instead of <urlset> and <url>. Parse the index first, then fetch and parse each child sitemap. The script in this tutorial handles both formats.
Common use cases: SEO audits (check titles, meta descriptions, broken pages), content inventories, competitive analysis (see what pages competitors have), migration planning (map old URLs to new ones), and monitoring (track when pages change).