update crawl blog
This commit is contained in:
+710
@@ -0,0 +1,710 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generic blog crawler for RAG using Firecrawl.
|
||||
|
||||
Discovery is sitemap-first (Yoast/WordPress), extraction is Firecrawl
|
||||
single-page scrape. Outputs:
|
||||
|
||||
articles.jsonl article-level structured records
|
||||
chunks.jsonl chunk-level records for embedding/RAG
|
||||
keywords.json keyword dictionary for extraction (optional input: read from <out-dir> unless --keywords is given; never written by the crawler)
|
||||
urls.json discovered URL list with sitemap lastmod
|
||||
raw/<slug>.json raw Firecrawl response per article
|
||||
markdown/<slug>.md extracted markdown per article
|
||||
errors.jsonl failed URLs/errors
|
||||
summary.json crawl summary
|
||||
|
||||
Usage:
|
||||
python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --limit 5
|
||||
python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --all
|
||||
python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --all --keywords my_keywords.json
|
||||
|
||||
Requires FIRECRAWL_API_KEY in environment or .env file.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import datetime as dt
|
||||
import hashlib
|
||||
import html
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Firecrawl v1 single-page scrape endpoint.
FIRECRAWL_SCRAPE_URL = "https://api.firecrawl.dev/v1/scrape"

# A "word" is any run of non-whitespace characters (used for chunk sizing).
WORD_RE = re.compile(r"\S+", re.UNICODE)
# ATX heading: one to six '#', whitespace, then the heading text (group 2).
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+?)\s*$")
# Inline markdown link "[text](target)"; group 1 is the visible text.
MD_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Inline markdown image "![alt](src)".
MD_IMAGE_RE = re.compile(r"!\[[^\]]*\]\([^)]+\)")
# HTML comments, real tags (must start with a letter after "<" or "</") and
# declarations such as <!DOCTYPE ...>.  The previous pattern "<[^>]+>" also
# swallowed ordinary prose between a literal "<" and a later ">" (for example
# "a < b and c > d"), silently deleting article text.
HTML_TAG_RE = re.compile(r"<!--.*?-->|</?[A-Za-z][^<>]*>|<![A-Za-z][^<>]*>", re.S)
# Runs of spaces/tabs (newlines are deliberately left alone).
MULTI_SPACE_RE = re.compile(r"[ \t]+")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Environment / dotenv
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_dotenv(path: Path | None = None) -> None:
    """Load ``KEY=value`` pairs from the first usable ``.env`` file.

    Candidates are tried in order: *path* (when given), then ``.env`` next to
    this script.  Only the first candidate that is a regular file is read.
    Variables already present in ``os.environ`` are never overwritten.

    Supported line forms: ``KEY=value``, ``export KEY=value``, values wrapped
    in one matching pair of single or double quotes, blank lines and ``#``
    comment lines.  Lines without ``=`` are ignored.
    """
    candidates = [Path(__file__).parent / ".env"]
    if path:
        candidates.insert(0, path)

    # is_file() (rather than exists()) so a directory called ".env" is skipped
    # instead of crashing read_text().
    env_path = next((p for p in candidates if p.is_file()), None)
    if env_path is None:
        return

    # "utf-8-sig" transparently drops a BOM that would otherwise become part
    # of the first key.
    content = env_path.read_text(encoding="utf-8-sig", errors="ignore")
    for raw in content.splitlines():
        line = raw.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, val = line.split("=", 1)

        # Accept the shell-style "export KEY=value" form.
        key_parts = key.split()
        if len(key_parts) == 2 and key_parts[0] == "export":
            key = key_parts[1]
        else:
            key = key.strip()

        # Remove exactly one pair of matching surrounding quotes; leave
        # unbalanced or inner quotes untouched.
        val = val.strip()
        if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
            val = val[1:-1]

        if key and key not in os.environ:
            os.environ[key] = val
|
||||
|
||||
|
||||
def now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    utc_now = dt.datetime.now(tz=dt.timezone.utc)
    return utc_now.isoformat()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# URL / sitemap helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fetch_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* and return the raw response body.

    The request identifies itself as ``BlogCrawler-RAG/1.0`` and advertises a
    preference for XML/HTML.  Errors raised by ``urllib`` propagate to the
    caller.
    """
    request_headers = {
        "User-Agent": "BlogCrawler-RAG/1.0",
        "Accept": "application/xml,text/xml,text/html,*/*",
    }
    request = urllib.request.Request(url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return body
|
||||
|
||||
|
||||
def parse_sitemap(
    url: str,
    domain_filter: str | None = None,
    *,
    _seen: set[str] | None = None,
) -> list[dict[str, str | None]]:
    """Return ``[{'url': ..., 'lastmod': ...}]`` from a sitemap URL.

    A sitemap index (a document containing ``<sitemap><loc>`` entries) is
    followed recursively; a child sitemap that fails to load is reported on
    stderr and skipped.  A plain sitemap contributes one entry per ``<url>``
    element that has a non-empty ``<loc>``; ``lastmod`` is ``None`` when the
    element is missing or empty.

    If *domain_filter* is provided, only URLs containing that string are kept
    (substring match on the full URL).

    ``_seen`` is internal bookkeeping for the recursion: sitemap URLs already
    visited are skipped so an index that references itself (directly or via a
    cycle) cannot recurse forever.  Callers should not pass it.

    Raises whatever ``fetch_bytes`` or the XML parser raise for the top-level
    sitemap.
    """
    seen = set() if _seen is None else _seen
    seen.add(url)

    # NOTE(review): the stdlib ElementTree parser is documented as not hardened
    # against maliciously constructed XML; sitemap content is remote input.
    root = ET.fromstring(fetch_bytes(url))
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    entries: list[dict[str, str | None]] = []

    child_locs = root.findall(".//sm:sitemap/sm:loc", ns)
    if child_locs:
        for loc_el in child_locs:
            # Pretty-printed indexes put whitespace/newlines around the URL.
            child_url = (loc_el.text or "").strip()
            if not child_url or child_url in seen:
                continue
            try:
                entries.extend(parse_sitemap(child_url, domain_filter, _seen=seen))
            except Exception as e:  # best effort: one bad child must not stop discovery
                print(f" WARN: failed to fetch child sitemap {child_url}: {e}", file=sys.stderr)
        return entries

    for url_el in root.findall(".//sm:url", ns):
        loc = (url_el.findtext("sm:loc", default="", namespaces=ns) or "").strip()
        if not loc:
            continue
        if domain_filter and domain_filter not in loc:
            continue
        lastmod = (url_el.findtext("sm:lastmod", default="", namespaces=ns) or "").strip()
        entries.append({"url": loc, "lastmod": lastmod or None})
    return entries
|
||||
|
||||
|
||||
def slug_from_url(url: str) -> str:
    """Derive a filesystem-friendly, lower-case slug from the path of *url*.

    A trailing ``.html``/``.htm``/``.php`` extension is removed, every run of
    characters outside ``[a-zA-Z0-9_-]`` becomes a single ``-``, and the result
    is capped at 160 characters.  An empty path yields ``"index"``; a path
    that reduces to nothing yields the first 12 hex digits of the URL's SHA-1.
    """
    raw_path = urllib.parse.urlparse(url).path.strip("/") or "index"
    without_ext = re.sub(r"\.(html?|php)$", "", raw_path)
    candidate = re.sub(r"[^a-zA-Z0-9_-]+", "-", without_ext).strip("-").lower()
    if candidate:
        return candidate[:160]
    return hashlib.sha1(url.encode()).hexdigest()[:12]
|
||||
|
||||
|
||||
def article_id_from_url(url: str, prefix: str = "blog") -> str:
    """Build an article identifier: *prefix*, ``_``, then the URL slug with ``-`` turned into ``_``."""
    identifier = slug_from_url(url).replace("-", "_")
    return f"{prefix}_{identifier}"
|
||||
|
||||
|
||||
def source_from_url(url: str) -> str:
    """Extract the host name of *url* as a source identifier.

    The host is lower-cased, and any ``user:password@`` prefix, port suffix
    and leading ``www.`` are removed, so credentials embedded in a sitemap URL
    never end up in the output files.  Returns ``""`` when *url* has no host.
    """
    # .hostname (unlike .netloc) excludes userinfo and port and is lower-cased.
    host = urllib.parse.urlparse(url).hostname or ""
    if host.startswith("www."):
        host = host[4:]
    return host
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Firecrawl API
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def firecrawl_scrape(url: str, api_key: str, timeout: int = 120) -> tuple[int, dict[str, Any]]:
    """Scrape one page through the Firecrawl scrape endpoint.

    Args:
        url: Page to scrape.
        api_key: Firecrawl API key, sent as a bearer token.
        timeout: Scrape timeout in seconds.  It is sent to Firecrawl in
            milliseconds; the local socket timeout is 20 seconds longer so the
            service can report its own timeout first.

    Returns:
        ``(http_status, payload)``.  ``payload`` is always a dict:
        the decoded JSON body when it is a JSON object; ``{"error": <raw body>}``
        when the body is not valid JSON; and a ``{"success": False, ...}``
        dict when the body is valid JSON but not an object.  Transport-level
        failures (DNS, refused connection, TLS, socket timeout, dropped
        connection) are reported as status ``0`` with
        ``{"success": False, "error": ...}`` instead of raising, so one
        unreachable page cannot abort a whole crawl.
    """
    import http.client  # local import: only needed for the exception type below

    payload = {
        "url": url,
        "formats": ["markdown"],
        "onlyMainContent": True,
        "waitFor": 1000,
        "timeout": timeout * 1000,
    }
    req = urllib.request.Request(
        FIRECRAWL_SCRAPE_URL,
        data=json.dumps(payload).encode("utf-8"),
        method="POST",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
            "User-Agent": "BlogCrawler-RAG/1.0",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=timeout + 20) as resp:
            status = resp.status
            raw = resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        # The API answered with an error status; its body usually explains why.
        status = e.code
        raw = e.read().decode("utf-8", errors="replace")
    except (OSError, http.client.HTTPException) as e:
        # URLError and socket timeouts are OSError subclasses.
        return 0, {"success": False, "error": f"request failed: {e}"}

    try:
        data = json.loads(raw)
    except ValueError:
        data = {"error": raw}
    if not isinstance(data, dict):
        # Callers index the payload like a dict; wrap anything else.
        data = {"success": False, "error": "unexpected response shape", "response": data}
    return status, data
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown / text processing
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def strip_markdown_to_text(markdown: str) -> str:
    """Reduce markdown to plain text.

    Images are removed, links are replaced by their visible text, heading
    markers and emphasis/code punctuation (``* _ ` ~``) are dropped, HTML tags
    become spaces, HTML entities are decoded, runs of spaces/tabs collapse to
    one space and three or more newlines collapse to a blank line.
    """
    text = markdown.replace("\r\n", "\n").replace("\r", "\n")
    text = MD_IMAGE_RE.sub("", text)
    # A linked image "[![alt](img)](href)" is reduced to "[](href)" by the
    # image removal above; MD_LINK_RE needs non-empty link text, so drop the
    # empty link explicitly instead of leaving the URL in the text.
    text = re.sub(r"\[\s*\]\([^)]*\)", "", text)
    text = MD_LINK_RE.sub(r"\1", text)
    text = re.sub(r"^#{1,6}\s+", "", text, flags=re.M)
    text = re.sub(r"[*_`~]", "", text)
    text = HTML_TAG_RE.sub(" ", text)
    text = html.unescape(text)
    text = MULTI_SPACE_RE.sub(" ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
||||
|
||||
|
||||
def clean_markdown(markdown: str) -> str:
    """Remove common page-chrome lines from scraped markdown, keeping content.

    A line is dropped when it contains one of the boilerplate phrases below
    (case-insensitive), or when the whole line is just a "close" control
    (optionally wrapped in a markdown link or list/emphasis punctuation).
    Trailing whitespace is trimmed and four or more consecutive newlines are
    reduced to three.
    """
    lines = markdown.replace("\r\n", "\n").split("\n")
    drop_contains = tuple(
        phrase.lower()
        for phrase in (
            "Press enter for Accessibility",
            "Accessibility menu",
            "Popup heading",
            "Skip to main",
            "Bỏ qua nội dung",
        )
    )
    # "close" used to be a substring marker, which also deleted real content
    # such as "Close the lid" or "enclosure".  It now has to be the entire line.
    drop_exact = {"close"}

    cleaned: list[str] = []
    for line in lines:
        lowered = line.strip().lower()
        if any(phrase in lowered for phrase in drop_contains):
            continue
        # Reduce "[Close](#)", "* close", "**Close**" and the like to the bare label.
        label = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", lowered)
        label = label.strip(" \t*_#>-•×[]")
        if label in drop_exact:
            continue
        cleaned.append(line.rstrip())

    text = "\n".join(cleaned)
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    return text.strip()
|
||||
|
||||
|
||||
def extract_title(data: dict[str, Any], markdown: str, fallback_url: str) -> str:
    """Pick the best available title for an article.

    Preference order: non-empty ``metadata["title"]``, then
    ``metadata["ogTitle"]`` (both HTML-unescaped), then the text of the first
    markdown heading, and finally a title-cased version of the URL slug.
    """
    metadata = data.get("metadata") or {}
    for field in ("title", "ogTitle"):
        candidate = metadata.get(field)
        if not isinstance(candidate, str):
            continue
        candidate = candidate.strip()
        if candidate:
            return html.unescape(candidate)

    heading_hits = (HEADING_RE.match(row.strip()) for row in markdown.splitlines())
    first_heading = next((hit for hit in heading_hits if hit), None)
    if first_heading is not None:
        return first_heading.group(2).strip()

    return slug_from_url(fallback_url).replace("-", " ").title()
|
||||
|
||||
|
||||
def extract_description(data: dict[str, Any], text: str) -> str | None:
    """Return a short description for an article.

    Uses the first non-empty string among ``metadata["description"]`` and
    ``metadata["ogDescription"]`` (HTML-unescaped); otherwise the first 300
    characters of *text*; ``None`` when *text* is empty too.
    """
    metadata = data.get("metadata") or {}
    for field in ("description", "ogDescription"):
        candidate = metadata.get(field)
        if not isinstance(candidate, str):
            continue
        candidate = candidate.strip()
        if candidate:
            return html.unescape(candidate)

    if not text:
        return None
    return text[:300].strip()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Keyword extraction (replaces product mentions from original)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_keywords(path: Path | None) -> list[dict[str, Any]]:
    """Load the keyword dictionary from a JSON file.

    Accepted shapes::

        [
          {"category": "hardware", "keywords": ["Raspberry Pi", "Arduino", "ESP32"]},
          {"category": "software", "keywords": ["Docker", "Ubuntu", "Debian"]}
        ]

    or a flat list of keyword strings, which is treated as one category named
    ``"general"``::

        ["Raspberry Pi", "Docker", "Home Assistant"]

    Both shapes are normalised the same way: keywords are stripped, blank and
    ``null`` entries are dropped, duplicates are removed, and the result is
    ordered longest-first (ties alphabetical, so the order is deterministic).
    Categories that end up with no keywords are omitted; non-dict rows and
    rows whose ``keywords`` is not a list are skipped.

    Returns ``[]`` when *path* is ``None`` or does not exist.

    Raises:
        ValueError: if the file is not valid JSON or its top level is not a list.
    """
    if path is None or not path.exists():
        return []
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError(f"keywords JSON must be a list: {path}")

    # Route a flat list through the same normalisation as structured input;
    # otherwise blank strings survive and later compile to a regex that
    # matches everywhere.
    if data and all(isinstance(x, str) for x in data):
        data = [{"category": "general", "keywords": data}]

    categories: list[dict[str, Any]] = []
    for row in data:
        if not isinstance(row, dict):
            continue
        category = str(row.get("category") or "general").strip()
        kw_list = row.get("keywords") or []
        if not isinstance(kw_list, list):
            continue
        unique = {str(k).strip() for k in kw_list if k is not None}
        unique.discard("")
        keywords = sorted(unique, key=lambda k: (-len(k), k))
        if keywords:
            categories.append({"category": category, "keywords": keywords})
    return categories
|
||||
|
||||
|
||||
def _alias_to_regex(alias: str) -> re.Pattern[str]:
|
||||
"""Compile a keyword regex with flexible whitespace and safe boundaries."""
|
||||
alias = html.unescape(alias or "").strip()
|
||||
alias = alias.replace("\u00a0", " ")
|
||||
alias = re.sub(r"[\u2010-\u2015]", "-", alias)
|
||||
pat = re.escape(alias).replace(r"\ ", r"\s+")
|
||||
return re.compile(rf"(?<![A-Za-z0-9]){pat}(?![A-Za-z0-9])", re.I | re.U)
|
||||
|
||||
|
||||
def spans_overlap(a: tuple[int, int], b: tuple[int, int]) -> bool:
    """Return True when the half-open ranges *a* and *b* share at least one position."""
    a_start, a_end = a[0], a[1]
    b_start, b_end = b[0], b[1]
    if a_start >= b_end:
        return False
    return b_start < a_end
|
||||
|
||||
|
||||
def keyword_mentions_detail(text: str, categories: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Count keyword mentions in *text*, grouped by category.

    Keywords from all categories are tried longest-first, and every accepted
    match reserves its character span.  A shorter keyword therefore cannot be
    counted on text already claimed by a longer one (for example "Orange Pi"
    inside "Orange Pi 5").

    Returns one dict per category with at least one match, with keys
    ``category``, ``matched_keywords`` (keyword -> count, most frequent first)
    and ``total_count``; categories are ordered by ``total_count`` descending.
    """
    if not categories:
        return []

    # Normalise the haystack the same way aliases are normalised.
    haystack = html.unescape(text or "").replace("\u00a0", " ")
    haystack = re.sub(r"[\u2010-\u2015]", "-", haystack)
    haystack = re.sub(r"\s+", " ", haystack)

    matchers: list[tuple[str, str, re.Pattern[str]]] = [
        (group["category"], term, _alias_to_regex(term))
        for group in categories
        for term in group["keywords"]
    ]
    matchers.sort(key=lambda item: len(item[1]), reverse=True)

    claimed: list[tuple[int, int]] = []
    by_category: dict[str, dict[str, Any]] = {}
    for category, term, pattern in matchers:
        for hit in pattern.finditer(haystack):
            span = (hit.start(), hit.end())
            if any(spans_overlap(span, taken) for taken in claimed):
                continue
            claimed.append(span)

            bucket = by_category.get(category)
            if bucket is None:
                bucket = {"category": category, "matched_keywords": {}, "total_count": 0}
                by_category[category] = bucket
            counts = bucket["matched_keywords"]
            counts[term] = counts.get(term, 0) + 1
            bucket["total_count"] += 1

    results: list[dict[str, Any]] = []
    for bucket in by_category.values():
        ranked = sorted(bucket["matched_keywords"].items(), key=lambda pair: pair[1], reverse=True)
        bucket["matched_keywords"] = dict(ranked)
        results.append(bucket)

    results.sort(key=lambda entry: entry["total_count"], reverse=True)
    return results
|
||||
|
||||
|
||||
def keyword_mentions(text: str, categories: list[dict[str, Any]]) -> list[str]:
    """Return every keyword matched in *text*, flattened across categories.

    Order follows ``keyword_mentions_detail``: categories by total count, then
    keywords by count within each category.
    """
    if not categories:
        return []
    return [
        keyword
        for group in keyword_mentions_detail(text, categories)
        for keyword in group["matched_keywords"]
    ]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Topic inference
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def infer_topic(title: str, text: str, categories: list[dict[str, Any]] | None = None) -> str | None:
    """Infer a coarse topic from the title and the first 2000 characters of text.

    When *categories* is given, the category with the most distinct keywords
    present wins (the first category wins ties).  If no category keyword is
    present, or no categories are given, a fixed list of fallback rules is
    tried in order.  Returns ``None`` when nothing matches.

    Terms are matched case-insensitively as whole words/phrases, not as raw
    substrings, so "ai" no longer fires on "email" or "main", "iot" on
    "patriot", or a keyword "Pi" on "spinning".
    """
    hay = (title + "\n" + text[:2000]).lower()

    def mentions(term: str) -> bool:
        """Return True when *term* occurs in ``hay`` bounded by non-alphanumerics."""
        words = term.lower().split()
        if not words:
            return False
        pattern = r"\s+".join(re.escape(word) for word in words)
        return re.search(rf"(?<![^\W_]){pattern}(?![^\W_])", hay) is not None

    # If categories are provided, use them for topic inference.
    if categories:
        best_category = None
        best_count = 0
        for cat in categories:
            count = sum(1 for kw in cat["keywords"] if mentions(str(kw)))
            if count > best_count:
                best_count = count
                best_category = cat["category"]
        if best_category and best_count > 0:
            return best_category

    # Fallback: common topic rules, first hit wins.
    rules = [
        ("docker", "docker"),
        ("kubernetes", "kubernetes"),
        ("linux", "linux"),
        ("ubuntu", "linux"),
        ("debian", "linux"),
        ("python", "programming"),
        ("javascript", "programming"),
        ("home assistant", "home assistant"),
        ("iot", "iot"),
        ("ai", "ai"),
        ("machine learning", "ai"),
    ]
    for needle, topic in rules:
        if mentions(needle):
            return topic
    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Chunking
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def chunk_markdown(
    markdown: str,
    article: dict[str, Any],
    categories: list[dict[str, Any]] | None = None,
    max_words: int = 650,
    overlap_words: int = 100,
) -> list[dict[str, Any]]:
    """Split markdown into chunk records with approximate word limits.

    The text is split into blocks on blank lines.  Blocks are accumulated
    until adding the next one would exceed *max_words*; the pending chunk is
    then emitted and the next chunk starts with a carry-over of at most
    *overlap_words* trailing words (whole blocks when they fit, otherwise the
    last words of the final block).  A single block longer than
    ``1.5 * max_words`` is cut into word windows of *max_words* that advance
    by ``max_words - overlap_words``.

    Each chunk's ``section`` is the most recent heading seen when the first
    new (non carry-over) block of that chunk was added.

    Args:
        markdown: Cleaned article markdown.
        article: Article record; ``id``, ``url`` and ``title`` are required,
            other fields are copied into chunk metadata when present.
        categories: Keyword categories for per-chunk keyword extraction.
        max_words: Target chunk size in words (values below 1 are treated as 1).
        overlap_words: Words shared between consecutive chunks (clamped to
            ``0 <= overlap_words < max_words`` so windows always advance).

    Returns:
        Chunk dicts in document order, ids ``<article id>__chunk_NNNN``.
    """
    max_words = max(1, max_words)
    # An overlap >= max_words would make the sliding window stand still.
    overlap_words = max(0, min(overlap_words, max_words - 1))

    blocks = re.split(r"\n\s*\n", markdown.strip()) if markdown.strip() else []
    chunks: list[dict[str, Any]] = []
    pending: list[str] = []       # carry-over first, then new blocks
    pending_words = 0
    has_new_content = False       # True once `pending` holds text not yet emitted
    section: str | None = None    # most recent heading seen
    pending_section: str | None = None

    def words_of(s: str) -> list[str]:
        return WORD_RE.findall(s)

    def emit(content: str, chunk_section: str | None) -> None:
        """Append one chunk record for *content*."""
        kw_detail = keyword_mentions_detail(content, categories) if categories else []
        kw_flat = [kw for group in kw_detail for kw in group["matched_keywords"]]
        chunks.append({
            "chunk_id": f"{article['id']}__chunk_{len(chunks):04d}",
            "article_id": article["id"],
            "url": article["url"],
            "title": article["title"],
            "section": chunk_section,
            "language": article.get("language", "en"),
            "content": content,
            "metadata": {
                "source": article.get("source"),
                "type": article.get("type"),
                "keyword_mentions": kw_flat,
                "keyword_mentions_detail": kw_detail,
                "article_keyword_mentions": article.get("keywords", []),
                "topic": article.get("topic"),
                "modified_at": article.get("modified_at"),
            },
        })

    def overlap_tail(parts: list[str]) -> tuple[list[str], int]:
        """Return the trailing part of *parts* worth at most ``overlap_words`` words."""
        if overlap_words <= 0:
            return [], 0
        tail: list[str] = []
        count = 0
        for part in reversed(parts):
            part_words = words_of(part)
            if count + len(part_words) > overlap_words:
                if not tail:
                    # The last block alone is over budget: keep only its ending.
                    tail.append(" ".join(part_words[-overlap_words:]))
                    count = overlap_words
                break
            tail.insert(0, part)
            count += len(part_words)
        return tail, count

    def flush() -> None:
        """Emit the pending chunk and seed the next one with the overlap."""
        nonlocal pending, pending_words, has_new_content
        content = "\n\n".join(pending).strip()
        if content:
            emit(content, pending_section)
        pending, pending_words = overlap_tail(pending)
        has_new_content = False

    for block in blocks:
        b = block.strip()
        if not b:
            continue
        m = HEADING_RE.match(b.splitlines()[0].strip())
        if m:
            section = m.group(2).strip()
        block_words = words_of(b)
        bw = len(block_words)

        # Only flush when there is unseen text; a bare carry-over must never
        # become a chunk of its own.
        if has_new_content and pending_words + bw > max_words:
            flush()

        if bw > max_words * 1.5:
            # Oversized block: emit self-contained word windows.  The window
            # step already provides the overlap, so no carry-over is mixed in.
            step = max_words - overlap_words
            start = 0
            while True:
                emit(" ".join(block_words[start:start + max_words]), section)
                if start + max_words >= bw:
                    break
                start += step
            pending = [" ".join(block_words[-overlap_words:])] if overlap_words else []
            pending_words = overlap_words if pending else 0
            has_new_content = False
            continue

        if not has_new_content:
            pending_section = section
        pending.append(b)
        pending_words += bw
        has_new_content = True

    if has_new_content:
        flush()
    return chunks
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# JSONL helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def append_jsonl(path: Path, record: dict[str, Any]) -> None:
    """Append *record* to *path* as one JSON line (UTF-8, keys sorted, non-ASCII kept)."""
    with path.open("a", encoding="utf-8") as handle:
        serialized = json.dumps(record, sort_keys=True, ensure_ascii=False)
        handle.write(f"{serialized}\n")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the crawler."""
    parser = argparse.ArgumentParser(
        description="Generic blog crawler for RAG (Firecrawl + sitemap)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Crawl 5 articles from a blog
  python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --limit 5

  # Crawl all articles
  python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --all

  # Use custom keywords for extraction
  python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --all --keywords keywords.json

  # Output to custom directory
  python3 crawl_blog.py --sitemap https://example.com/post-sitemap.xml --all --out-dir ./my_data
""",
    )
    parser.add_argument("--sitemap", required=True, help="Sitemap URL (e.g. https://example.com/post-sitemap.xml)")
    parser.add_argument("--out-dir", type=Path, default=Path("./blog_data"), help="Output directory")
    parser.add_argument("--keywords", type=Path, default=None, help="Keywords JSON path; defaults to <out-dir>/keywords.json")
    parser.add_argument("--limit", type=int, default=None, help="Only process first N article URLs")
    parser.add_argument("--all", action="store_true", help="Process all discovered article URLs")
    parser.add_argument("--sleep", type=float, default=1.0, help="Delay between Firecrawl calls (seconds)")
    parser.add_argument("--force", action="store_true", help="Re-scrape even if raw file exists")
    parser.add_argument("--max-words", type=int, default=650, help="Target words per chunk")
    parser.add_argument("--overlap-words", type=int, default=100, help="Overlap words between chunks")
    parser.add_argument("--language", default="en", help="Default language code for articles")
    return parser


def _load_cached_raw(raw_path: Path, url: str) -> tuple[int, dict[str, Any]] | None:
    """Return ``(status, payload)`` from a reusable cached scrape, else ``None``.

    A cache entry is reusable only when it parses, records a successful
    scrape, and (if it names a source URL) belongs to *url*.  Anything else is
    a cache miss, so failed scrapes are retried on the next run.
    """
    try:
        raw_data = json.loads(raw_path.read_text(encoding="utf-8"))
        status = int(raw_data.get("_http_status", 200))
    except (OSError, ValueError, TypeError, AttributeError):
        return None
    if status >= 400 or not raw_data.get("success"):
        return None
    # Two URLs can map to the same slug; never attribute another page's content.
    if raw_data.get("_source_url") not in (None, url):
        return None
    return status, raw_data


def _prune_stale_records(path: Path, old_size: int, refreshed_urls: set[str]) -> None:
    """Drop superseded records from the part of *path* written before this run.

    Only the first *old_size* bytes are filtered: records there whose ``url``
    is in *refreshed_urls* are removed.  Everything appended during this run,
    and any line that does not parse, is kept unchanged.
    """
    if old_size <= 0 or not refreshed_urls or not path.exists():
        return
    tmp_path = path.with_name(path.name + ".tmp")
    with path.open("rb") as src, tmp_path.open("wb") as dst:
        while src.tell() < old_size:
            line = src.readline()
            if not line:
                break
            try:
                record = json.loads(line)
            except ValueError:
                record = None
            if isinstance(record, dict) and record.get("url") in refreshed_urls:
                continue
            dst.write(line)
        for line in src:
            dst.write(line)
    tmp_path.replace(path)


def main(argv: list[str] | None = None) -> int:
    """Run the crawler command-line interface.

    Discovers article URLs from the sitemap, scrapes each selected URL through
    Firecrawl (reusing successful cached responses under ``raw/`` unless
    ``--force`` is given), and writes articles, chunks, markdown, errors and a
    summary into the output directory.  Records written by earlier runs for
    URLs refreshed in this run are replaced rather than duplicated.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]``.

    Returns:
        ``0`` when every selected URL succeeded, ``1`` when at least one
        failed, ``2`` for configuration or discovery errors (missing API key,
        invalid keywords file, unreadable sitemap).
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be zero or a positive integer")
    if args.max_words < 1 or not 0 <= args.overlap_words < args.max_words:
        parser.error("--overlap-words must be >= 0 and smaller than --max-words")
    if not args.all and args.limit is None:
        args.limit = 5

    load_dotenv()
    api_key = os.environ.get("FIRECRAWL_API_KEY")
    if not api_key:
        print("ERROR: FIRECRAWL_API_KEY is not set in environment or .env file", file=sys.stderr)
        return 2

    out_dir: Path = args.out_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load keywords
    keywords_path: Path = args.keywords or (out_dir / "keywords.json")
    if args.keywords is not None and not keywords_path.exists():
        # An explicitly requested file that is missing is almost certainly a typo.
        print(f"WARN: keywords file not found: {keywords_path}", file=sys.stderr)
    try:
        categories = load_keywords(keywords_path)
    except ValueError as e:
        print(f"ERROR: invalid keywords file {keywords_path}: {e}", file=sys.stderr)
        return 2

    # Create output directories
    raw_dir = out_dir / "raw"
    md_dir = out_dir / "markdown"
    raw_dir.mkdir(parents=True, exist_ok=True)
    md_dir.mkdir(parents=True, exist_ok=True)

    # Discover URLs from sitemap
    source_domain = source_from_url(args.sitemap)
    try:
        urls = parse_sitemap(args.sitemap)
    except (OSError, ValueError, ET.ParseError) as e:
        print(f"ERROR: failed to read sitemap {args.sitemap}: {e}", file=sys.stderr)
        return 2
    urls_path = out_dir / "urls.json"
    urls_path.write_text(
        json.dumps({"sitemap": args.sitemap, "source": source_domain, "count": len(urls), "urls": urls}, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    selected = urls if args.all else urls[: args.limit]
    articles_path = out_dir / "articles.jsonl"
    chunks_path = out_dir / "chunks.jsonl"
    errors_path = out_dir / "errors.jsonl"

    if args.force:
        articles_path.unlink(missing_ok=True)
        chunks_path.unlink(missing_ok=True)
        errors_path.unlink(missing_ok=True)

    # Remember how much of each output file predates this run, so stale
    # records for re-processed URLs can be removed afterwards.
    old_sizes = {
        path: (path.stat().st_size if path.exists() else 0)
        for path in (articles_path, chunks_path)
    }
    refreshed_urls: set[str] = set()

    print(f"Source: {source_domain}")
    print(f"Discovered article URLs: {len(urls)}")
    print(f"Processing: {len(selected)}")
    print(f"Output: {out_dir}")
    print(f"Keywords: {keywords_path} ({sum(len(c['keywords']) for c in categories)} keywords in {len(categories)} categories)")

    ok = 0
    failed = 0
    total_chunks = 0
    pause = max(0.0, args.sleep)

    for idx, item in enumerate(selected, 1):
        url = str(item["url"])
        lastmod = item.get("lastmod")
        slug = slug_from_url(url)
        raw_path = raw_dir / f"{slug}.json"
        md_path = md_dir / f"{slug}.md"
        print(f"[{idx}/{len(selected)}] {url}")

        # Use cached raw if available (successful responses only)
        cached = None
        if raw_path.exists() and not args.force:
            cached = _load_cached_raw(raw_path, url)

        if cached is not None:
            status, raw_data = cached
        else:
            try:
                status, raw_data = firecrawl_scrape(url, api_key)
            except (OSError, ValueError) as e:
                # Network or decoding failure: record it and keep crawling.
                append_jsonl(errors_path, {"url": url, "error": f"scrape request failed: {e}", "at": now_iso()})
                print(f" ERROR scrape request failed: {e}")
                failed += 1
                continue
            finally:
                time.sleep(pause)
            raw_data["_http_status"] = status
            raw_data["_source_url"] = url
            raw_data["_scraped_at"] = now_iso()
            raw_path.write_text(json.dumps(raw_data, ensure_ascii=False, indent=2), encoding="utf-8")

        if status >= 400 or not raw_data.get("success"):
            append_jsonl(errors_path, {"url": url, "http_status": status, "error": raw_data, "at": now_iso()})
            print(f" ERROR status={status} success={raw_data.get('success')}")
            failed += 1
            continue

        data = raw_data.get("data") or {}
        markdown = data.get("markdown") or ""
        markdown = clean_markdown(markdown)
        text = strip_markdown_to_text(markdown)
        if len(text) < 100:
            append_jsonl(errors_path, {"url": url, "http_status": status, "error": "too little text", "text_chars": len(text), "at": now_iso()})
            print(f" ERROR too little text chars={len(text)}")
            failed += 1
            continue

        title = extract_title(data, markdown, url)
        desc = extract_description(data, text)
        kw_detail = keyword_mentions_detail(title + "\n" + text, categories)
        kw_list = [kw for cat in kw_detail for kw in cat["matched_keywords"]]

        article = {
            "id": article_id_from_url(url),
            "url": url,
            "source": source_domain,
            "type": "blog_article",
            "title": title,
            "description": desc,
            "published_at": None,
            "modified_at": lastmod,
            "language": args.language,
            "markdown": markdown,
            "text": text,
            "tags": [],
            "keywords": kw_list,
            "keyword_mentions_detail": kw_detail,
            "topic": infer_topic(title, text, categories),
            "scraped_at": raw_data.get("_scraped_at") or now_iso(),
            "metadata": data.get("metadata") or {},
        }
        chunks = chunk_markdown(markdown, article, categories=categories, max_words=args.max_words, overlap_words=args.overlap_words)

        md_path.write_text(markdown, encoding="utf-8")
        append_jsonl(articles_path, article)
        for chunk in chunks:
            append_jsonl(chunks_path, chunk)
        refreshed_urls.add(url)

        ok += 1
        total_chunks += len(chunks)
        print(f" OK title={title!r} markdown_chars={len(markdown)} chunks={len(chunks)} keywords={kw_list[:5]}")

    # Without this, every re-run that reuses the cache would append a second
    # copy of each article and of all its chunks.
    for path, old_size in old_sizes.items():
        _prune_stale_records(path, old_size, refreshed_urls)

    summary = {
        "sitemap": args.sitemap,
        "source": source_domain,
        "discovered": len(urls),
        "processed": len(selected),
        "ok": ok,
        "failed": failed,
        "chunks": total_chunks,
        "out_dir": str(out_dir),
        "keywords_path": str(keywords_path),
        "keywords_loaded": sum(len(c["keywords"]) for c in categories),
        "finished_at": now_iso(),
    }
    (out_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print("SUMMARY", json.dumps(summary, ensure_ascii=False))
    return 0 if failed == 0 else 1
|
||||
|
||||
|
||||
# Script entry point: the process exit status is main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user