SEO Technical Audit Workflow
Comprehensive technical SEO audit for websites, including crawling, analysis, and reporting
Workflow Information
ID: seo_technical_audit_v1
Namespace: default
Version: 1.0.1
Created: 2025-07-08
Updated: 2025-07-08
Tasks: 13
Inputs
| Name | Type | Required | Default |
|---|---|---|---|
| target_url | string | Required | None |
| crawl_depth | integer | Optional | 3 |
| max_pages | integer | Optional | 100 |
| audit_type | string | Optional | comprehensive |
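The inputs also carry validation rules, declared in the `inputs` block of the YAML source below: a URL pattern for `target_url`, min/max bounds for `crawl_depth` and `max_pages`, and an enum for `audit_type`. The following is a minimal sketch of how a caller could apply the same defaults and checks before triggering the workflow; `check_inputs` is an illustrative helper, not part of the workflow engine.

```python
import re

# Constraints copied from the workflow's `inputs` section (see YAML Source below).
CONSTRAINTS = {
    "target_url": {"pattern": r"^https?://.*"},
    "crawl_depth": {"min": 1, "max": 10, "default": 3},
    "max_pages": {"min": 10, "max": 1000, "default": 100},
    "audit_type": {"enum": ["basic", "comprehensive", "focused"], "default": "comprehensive"},
}


def check_inputs(inputs: dict) -> dict:
    """Apply declared defaults and check each value against its constraint."""
    resolved = {
        "target_url": inputs["target_url"],  # required; raises KeyError if missing
        "crawl_depth": int(inputs.get("crawl_depth", CONSTRAINTS["crawl_depth"]["default"])),
        "max_pages": int(inputs.get("max_pages", CONSTRAINTS["max_pages"]["default"])),
        "audit_type": inputs.get("audit_type", CONSTRAINTS["audit_type"]["default"]),
    }
    if not re.match(CONSTRAINTS["target_url"]["pattern"], resolved["target_url"]):
        raise ValueError("target_url must match ^https?://.*")
    for name in ("crawl_depth", "max_pages"):
        lo, hi = CONSTRAINTS[name]["min"], CONSTRAINTS[name]["max"]
        if not lo <= resolved[name] <= hi:
            raise ValueError(f"{name} must be between {lo} and {hi}")
    if resolved["audit_type"] not in CONSTRAINTS["audit_type"]["enum"]:
        raise ValueError("audit_type must be one of: basic, comprehensive, focused")
    return resolved


# Example: defaults are filled in for anything not supplied.
print(check_inputs({"target_url": "https://example.com", "crawl_depth": 2}))
```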
Outputs
| Name | Type | Source Task | Description |
|---|---|---|---|
| ai_insights | string | ai_seo_analysis | AI-generated SEO insights |
| audit_report | string | compile_report | Complete SEO technical audit report |
| recommendations | string | generate_recommendations | Prioritized optimization recommendations |
| technical_summary | string | technical_seo_analysis | Technical issues summary |
Tasks
| Task | Type | Description |
|---|---|---|
| validate_url | script | Parse and validate the target URL; derive the domain, base URL, robots.txt URL, and sitemap URL |
| check_robots_txt | http | Fetch robots.txt from the target domain (runs only if validation succeeds) |
| analyze_robots | script | Parse robots.txt for disallowed paths, crawl delay, and sitemap declarations |
| crawl_website | mcp | Crawl the site via the `crawl` MCP tool, bounded by max_pages and crawl_depth |
| process_crawl_data | script | Normalize crawl results into per-page metadata (title, description, status code, word count) |
| technical_seo_analysis | script | Analyze meta tags, status codes, thin content, URL depth, duplicates, and page size |
| get_serp_data | mcp | Fetch organic SERP data for the domain (DataForSEO) |
| get_domain_metrics | mcp | Fetch the domain rank overview (DataForSEO Labs) |
| get_backlinks | mcp | Fetch the backlink summary for the domain (DataForSEO) |
| ai_seo_analysis | ai_agent | Generate structured SEO insights from the technical, SERP, and domain data |
| generate_recommendations | ai_agent | Produce a prioritized optimization action plan |
| compile_report | script | Assemble the full audit report and compute an overall health score |
| store_audit_results | storage | Insert the compiled report into the seo_audit_results table |
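Script tasks in this workflow exchange data through two conventions visible in the YAML source: upstream results are read from environment variables named after the producing task and field (for example `check_robots_txt.status_code`), and each script returns its own results by printing a single line prefixed with `__OUTPUTS__` followed by a JSON object. The sketch below shows a minimal script task using both conventions; the exact mechanism by which the platform injects those environment variables is assumed here.

```python
import json
import os

# Read an upstream task's output; the engine is assumed to expose it as an
# environment variable named "<task_id>.<field>", as the scripts below do.
status_code = os.environ.get("check_robots_txt.status_code", "404")

result = {"has_robots_txt": status_code == "200"}

# Return structured outputs to the engine: one line, __OUTPUTS__ plus JSON.
print(f"__OUTPUTS__ {json.dumps(result)}")
```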
YAML Source
id: seo_technical_audit_v1
name: SEO Technical Audit Workflow
tasks:
- id: validate_url
type: script
script: "import json\nimport urllib.parse\nfrom urllib.parse import urlparse\n\n\
target_url = \"${target_url}\"\n\n# Parse and validate URL\ntry:\n parsed =\
\ urlparse(target_url)\n if not parsed.scheme or not parsed.netloc:\n \
\ raise ValueError(\"Invalid URL format\")\n \n domain = parsed.netloc\n\
\ base_url = f\"{parsed.scheme}://{parsed.netloc}\"\n \n output = {\n\
\ \"url\": target_url,\n \"domain\": domain,\n \"base_url\"\
: base_url,\n \"robots_txt_url\": f\"{base_url}/robots.txt\",\n \
\ \"sitemap_url\": f\"{base_url}/sitemap.xml\",\n \"validation_status\"\
: \"valid\"\n }\nexcept Exception as e:\n output = {\n \"validation_status\"\
: \"invalid\",\n \"error\": str(e)\n }\n\nprint(f\"__OUTPUTS__ {json.dumps(output)}\"\
)\n"
timeout_seconds: 30
- id: check_robots_txt
type: http
when: validation_status == 'valid'
request:
url: ${validate_url.robots_txt_url}
method: GET
depends_on:
- validate_url
retry_policy:
max_attempts: 2
initial_interval: 1
timeout_seconds: 30
continue_on_error: true
- id: analyze_robots
type: script
script: "import json\nimport os\n\nrobots_data = os.environ.get('check_robots_txt.response',\
\ '{}')\nstatus_code = os.environ.get('check_robots_txt.status_code', '404')\n\
\nanalysis = {\n \"has_robots_txt\": status_code == \"200\",\n \"allows_crawling\"\
: True,\n \"disallowed_paths\": [],\n \"crawl_delay\": 0,\n \"sitemaps\"\
: []\n}\n\nif status_code == \"200\":\n try:\n response = json.loads(robots_data)\n\
\ content = response.get('body', '')\n \n # Parse robots.txt\
\ content\n for line in content.split('\\n'):\n line = line.strip().lower()\n\
\ if line.startswith('disallow:'):\n path = line.replace('disallow:',\
\ '').strip()\n if path and path != '/':\n analysis['disallowed_paths'].append(path)\n\
\ elif line.startswith('crawl-delay:'):\n try:\n \
\ analysis['crawl_delay'] = int(line.replace('crawl-delay:', '').strip())\n\
\ except:\n pass\n elif line.startswith('sitemap:'):\n\
\ sitemap = line.replace('sitemap:', '').strip()\n \
\ analysis['sitemaps'].append(sitemap)\n except:\n pass\n\nprint(f\"\
__OUTPUTS__ {json.dumps(analysis)}\")\n"
depends_on:
- check_robots_txt
timeout_seconds: 30
- id: crawl_website
type: mcp
tool_name: crawl
depends_on:
- analyze_robots
retry_policy:
max_attempts: 2
initial_interval: 5
deployment_id: pod-9qgkf5zz
tool_arguments:
url: ${target_url}
limit: ${max_pages}
max_depth: ${crawl_depth}
include_raw_html: false
only_main_content: true
timeout_seconds: 300
- id: process_crawl_data
type: script
script: "import json\nimport os\nfrom urllib.parse import urlparse\n\n# Try different\
\ access patterns for crawl data\ncrawl_data = None\nfor var_name in ['crawl_website.result',\
\ 'crawl_website.output', 'crawl_website']:\n var_value = os.environ.get(var_name,\
\ '')\n if var_value and var_value not in ['{}', '[]']:\n try:\n \
\ crawl_data = json.loads(var_value)\n break\n except:\n\
\ continue\n\nif not crawl_data:\n crawl_data = {\"data\": []}\n\
\npages_data = crawl_data.get('data', [])\n\n# Process each page\nprocessed_pages\
\ = []\nfor page in pages_data[:50]: # Limit to 50 pages for processing\n \
\ page_info = {\n \"url\": page.get('url', ''),\n \"title\": page.get('metadata',\
\ {}).get('title', ''),\n \"description\": page.get('metadata', {}).get('description',\
\ ''),\n \"status_code\": page.get('metadata', {}).get('statusCode', 200),\n\
\ \"content_length\": len(page.get('content', '')),\n \"has_title\"\
: bool(page.get('metadata', {}).get('title')),\n \"has_description\": bool(page.get('metadata',\
\ {}).get('description')),\n \"word_count\": len(page.get('content', '').split()),\n\
\ \"headers\": page.get('metadata', {}).get('headers', {})\n }\n \
\ processed_pages.append(page_info)\n\nsummary = {\n \"total_pages_crawled\"\
: len(pages_data),\n \"pages_analyzed\": len(processed_pages),\n \"pages_data\"\
: processed_pages,\n \"crawl_status\": \"success\" if pages_data else \"no_data\"\
\n}\n\nprint(f\"__OUTPUTS__ {json.dumps(summary)}\")\n"
depends_on:
- crawl_website
requirements:
- beautifulsoup4==4.12.2
- lxml==4.9.3
timeout_seconds: 120
- id: technical_seo_analysis
type: script
script: "import json\nimport os\nfrom collections import Counter\n\n# Get processed\
\ data\nprocess_data = json.loads(os.environ.get('process_crawl_data', '{}'))\n\
pages_data = process_data.get('pages_data', [])\n\n# Initialize analysis results\n\
analysis = {\n \"meta_tags\": {\n \"missing_titles\": [],\n \"\
duplicate_titles\": [],\n \"missing_descriptions\": [],\n \"duplicate_descriptions\"\
: [],\n \"title_length_issues\": []\n },\n \"status_codes\": {\n\
\ \"200\": 0,\n \"301\": 0,\n \"302\": 0,\n \"404\"\
: 0,\n \"500\": 0,\n \"other\": 0\n },\n \"content_analysis\"\
: {\n \"thin_content_pages\": [],\n \"average_word_count\": 0,\n\
\ \"pages_under_300_words\": 0\n },\n \"url_structure\": {\n \
\ \"deep_urls\": [],\n \"non_friendly_urls\": []\n },\n \"performance_indicators\"\
: {\n \"large_pages\": []\n }\n}\n\n# Analyze pages\ntitles = []\ndescriptions\
\ = []\nword_counts = []\n\nfor page in pages_data:\n url = page['url']\n \
\ \n # Title analysis\n if not page['has_title']:\n analysis['meta_tags']['missing_titles'].append(url)\n\
\ else:\n title = page['title']\n titles.append(title)\n \
\ if len(title) < 30 or len(title) > 60:\n analysis['meta_tags']['title_length_issues'].append({\n\
\ \"url\": url,\n \"title\": title,\n \
\ \"length\": len(title)\n })\n \n # Description analysis\n\
\ if not page['has_description']:\n analysis['meta_tags']['missing_descriptions'].append(url)\n\
\ else:\n descriptions.append(page['description'])\n \n # Status\
\ code analysis\n status = str(page.get('status_code', 200))\n if status\
\ in analysis['status_codes']:\n analysis['status_codes'][status] += 1\n\
\ else:\n analysis['status_codes']['other'] += 1\n \n # Content\
\ analysis\n word_count = page['word_count']\n word_counts.append(word_count)\n\
\ if word_count < 300:\n analysis['content_analysis']['thin_content_pages'].append({\n\
\ \"url\": url,\n \"word_count\": word_count\n })\n\
\ analysis['content_analysis']['pages_under_300_words'] += 1\n \n \
\ # URL structure\n url_depth = url.count('/') - 2\n if url_depth > 3:\n\
\ analysis['url_structure']['deep_urls'].append({\n \"url\"\
: url,\n \"depth\": url_depth\n })\n \n # Performance\n\
\ if page['content_length'] > 100000:\n analysis['performance_indicators']['large_pages'].append({\n\
\ \"url\": url,\n \"size_bytes\": page['content_length']\n\
\ })\n\n# Find duplicates\ntitle_counts = Counter(titles)\ndesc_counts\
\ = Counter(descriptions)\n\nfor title, count in title_counts.items():\n if\
\ count > 1:\n analysis['meta_tags']['duplicate_titles'].append({\n \
\ \"title\": title,\n \"count\": count\n })\n\nfor desc,\
\ count in desc_counts.items():\n if count > 1:\n analysis['meta_tags']['duplicate_descriptions'].append({\n\
\ \"description\": desc[:100] + \"...\",\n \"count\": count\n\
\ })\n\n# Calculate averages\nif word_counts:\n analysis['content_analysis']['average_word_count']\
\ = sum(word_counts) // len(word_counts)\n\nprint(f\"__OUTPUTS__ {json.dumps(analysis)}\"\
)\n"
depends_on:
- process_crawl_data
timeout_seconds: 120
- id: get_serp_data
type: mcp
tool_name: mcp__dataforseo__serp_organic_live_advanced
depends_on:
- validate_url
deployment_id: pod-ow0pvk2h
tool_arguments:
depth: 10
keyword: ${validate_url.domain}
language_code: en
location_name: United States
search_engine: google
timeout_seconds: 60
continue_on_error: true
- id: get_domain_metrics
type: mcp
tool_name: mcp__dataforseo__dataforseo_labs_google_domain_rank_overview
depends_on:
- validate_url
deployment_id: pod-ow0pvk2h
tool_arguments:
target: ${validate_url.domain}
language_code: en
location_name: United States
timeout_seconds: 60
continue_on_error: true
- id: get_backlinks
type: mcp
tool_name: mcp__dataforseo__backlinks_summary
depends_on:
- validate_url
deployment_id: pod-ow0pvk2h
tool_arguments:
target: ${validate_url.domain}
timeout_seconds: 60
continue_on_error: true
- id: ai_seo_analysis
type: ai_agent
prompt: 'Analyze the SEO technical audit results and provide comprehensive insights:
Technical Analysis: ${technical_seo_analysis}
SERP Data: ${get_serp_data}
Domain Metrics: ${get_domain_metrics}
Provide:
1. Critical SEO issues that need immediate attention
2. Medium priority improvements
3. Quick wins for SEO improvement
4. Technical debt assessment
5. Competitive positioning analysis
Format as structured JSON with actionable recommendations.
'
agent_type: analyst
depends_on:
- technical_seo_analysis
- get_serp_data
- get_domain_metrics
model_client_id: seo_analyzer
- id: generate_recommendations
type: ai_agent
prompt: 'Based on the SEO analysis, generate specific optimization recommendations:
Analysis Results: ${ai_seo_analysis}
Technical Issues: ${technical_seo_analysis}
Create a prioritized action plan with:
1. Immediate fixes (0-7 days)
2. Short-term improvements (1-4 weeks)
3. Long-term strategy (1-3 months)
Include specific implementation steps for each recommendation.
Focus on ROI and impact on search rankings.
'
agent_type: optimizer
depends_on:
- ai_seo_analysis
- technical_seo_analysis
model_client_id: content_optimizer
- id: compile_report
type: script
script: "import json\nimport os\nfrom datetime import datetime\n\n# Gather all analysis\
\ data\ntechnical_analysis = json.loads(os.environ.get('technical_seo_analysis',\
\ '{}'))\nai_analysis = os.environ.get('ai_seo_analysis', '{}')\nrecommendations\
\ = os.environ.get('generate_recommendations', '{}')\nbacklinks = os.environ.get('get_backlinks',\
\ '{}')\n\n# Parse backlink data if available\nbacklink_summary = {}\ntry:\n \
\ backlink_data = json.loads(backlinks)\n if isinstance(backlink_data, dict):\n\
\ backlink_summary = {\n \"total_backlinks\": backlink_data.get('backlinks',\
\ 0),\n \"referring_domains\": backlink_data.get('referring_domains',\
\ 0),\n \"domain_rank\": backlink_data.get('rank', 0)\n }\n\
except:\n backlink_summary = {\"status\": \"unavailable\"}\n\n# Compile comprehensive\
\ report\nreport = {\n \"audit_date\": datetime.utcnow().isoformat(),\n \
\ \"target_url\": \"${target_url}\",\n \"audit_type\": \"${audit_type}\",\n\
\ \"executive_summary\": {\n \"total_pages_analyzed\": technical_analysis.get('content_analysis',\
\ {}).get('pages_under_300_words', 0),\n \"critical_issues_found\": len(technical_analysis.get('meta_tags',\
\ {}).get('missing_titles', [])) + \n len(technical_analysis.get('meta_tags',\
\ {}).get('missing_descriptions', [])),\n \"overall_health_score\": \"\
calculated_below\"\n },\n \"technical_issues\": technical_analysis,\n \
\ \"backlink_profile\": backlink_summary,\n \"ai_insights\": ai_analysis,\n\
\ \"recommendations\": recommendations,\n \"metrics_summary\": {\n \
\ \"meta_tag_issues\": {\n \"missing_titles\": len(technical_analysis.get('meta_tags',\
\ {}).get('missing_titles', [])),\n \"missing_descriptions\": len(technical_analysis.get('meta_tags',\
\ {}).get('missing_descriptions', [])),\n \"duplicate_titles\": len(technical_analysis.get('meta_tags',\
\ {}).get('duplicate_titles', [])),\n \"title_length_issues\": len(technical_analysis.get('meta_tags',\
\ {}).get('title_length_issues', []))\n },\n \"content_issues\"\
: {\n \"thin_content_pages\": technical_analysis.get('content_analysis',\
\ {}).get('pages_under_300_words', 0),\n \"average_word_count\": technical_analysis.get('content_analysis',\
\ {}).get('average_word_count', 0)\n },\n \"technical_issues\":\
\ {\n \"404_errors\": technical_analysis.get('status_codes', {}).get('404',\
\ 0),\n \"redirect_chains\": technical_analysis.get('status_codes',\
\ {}).get('301', 0) + \n technical_analysis.get('status_codes',\
\ {}).get('302', 0),\n \"deep_urls\": len(technical_analysis.get('url_structure',\
\ {}).get('deep_urls', []))\n }\n }\n}\n\n# Calculate health score\n\
total_issues = sum([\n report['metrics_summary']['meta_tag_issues']['missing_titles'],\n\
\ report['metrics_summary']['meta_tag_issues']['missing_descriptions'],\n \
\ report['metrics_summary']['content_issues']['thin_content_pages'],\n report['metrics_summary']['technical_issues']['404_errors']\n\
])\n\nif total_issues == 0:\n health_score = \"Excellent (90-100)\"\nelif total_issues\
\ < 5:\n health_score = \"Good (70-89)\"\nelif total_issues < 15:\n health_score\
\ = \"Fair (50-69)\"\nelse:\n health_score = \"Poor (0-49)\"\n\nreport['executive_summary']['overall_health_score']\
\ = health_score\n\nprint(f\"__OUTPUTS__ {json.dumps(report)}\")\n"
depends_on:
- ai_seo_analysis
- generate_recommendations
- technical_seo_analysis
- get_backlinks
timeout_seconds: 60
- id: store_audit_results
data:
audit_date: ${execution.started_at}
audit_type: ${audit_type}
target_url: ${target_url}
report_data: ${compile_report}
workflow_id: ${workflow.id}
execution_id: ${execution.id}
health_score: ${compile_report.executive_summary.overall_health_score}
type: storage
table: seo_audit_results
operation: insert
depends_on:
- compile_report
continue_on_error: true
inputs:
- name: target_url
type: string
required: true
validation:
pattern: ^https?://.*
description: The URL of the website to audit
- name: crawl_depth
type: integer
default: 3
validation:
max: 10
min: 1
description: Maximum crawl depth
- name: max_pages
type: integer
default: 100
validation:
max: 1000
min: 10
description: Maximum number of pages to crawl
- enum:
- basic
- comprehensive
- focused
name: audit_type
type: string
default: comprehensive
description: Type of audit to perform
outputs:
ai_insights:
source: ai_seo_analysis
description: AI-generated SEO insights
audit_report:
source: compile_report
description: Complete SEO technical audit report
recommendations:
source: generate_recommendations
description: Prioritized optimization recommendations
technical_summary:
source: technical_seo_analysis
description: Technical issues summary
version: 1.0.1
description: Comprehensive technical SEO audit for websites including crawling, analysis,
and reporting
model_clients:
- id: seo_analyzer
config:
model: gpt-4o-mini
api_key: ${env.OPENAI_API_KEY}
max_tokens: 2000
temperature: 0.3
provider: openai
- id: content_optimizer
config:
model: gpt-4o-mini
api_key: ${env.OPENAI_API_KEY}
max_tokens: 1500
temperature: 0.5
provider: openai
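The report produced by `compile_report` (and stored by `store_audit_results`) is a single JSON document whose shape is fixed by the script above: an `executive_summary`, the raw `technical_issues`, a `backlink_profile`, the AI insights and recommendations, and a `metrics_summary`. The following is a hedged sketch of a downstream consumer that prints the headline figures from that JSON; how the stored row is retrieved from `seo_audit_results` depends on the storage backend and is left out.

```python
import json


def summarize_audit(report_json: str) -> str:
    """Pull the headline figures out of a compile_report payload."""
    report = json.loads(report_json)
    summary = report.get("executive_summary", {})
    metrics = report.get("metrics_summary", {})
    meta = metrics.get("meta_tag_issues", {})
    content = metrics.get("content_issues", {})
    technical = metrics.get("technical_issues", {})
    lines = [
        f"Audit of {report.get('target_url')} ({report.get('audit_date')})",
        f"Health score: {summary.get('overall_health_score')}",
        f"Missing titles: {meta.get('missing_titles', 0)}, "
        f"missing descriptions: {meta.get('missing_descriptions', 0)}",
        f"Thin-content pages: {content.get('thin_content_pages', 0)}",
        f"404 errors: {technical.get('404_errors', 0)}, "
        f"redirects: {technical.get('redirect_chains', 0)}",
    ]
    return "\n".join(lines)
```

For example, feeding the JSON emitted by `compile_report` into `summarize_audit` yields a short plain-text digest suitable for a notification or log entry.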