Files
molenda.net/eeeeee/scraper.py
T
Sebastian Molenda ab96d82fcf init
2026-05-12 21:10:38 +02:00

62 lines
2.4 KiB
Python

import os
from lxml import etree
# --- Settings ---------------------------------------------------------------
# Directory tree that is walked for *.xml files: the local clone of the
# php/doc-en repository.
SOURCE_DIR = "./doc-en"
# Output file that receives the extracted paragraph/title text.
TEXT_FILE = "php_text.txt"
# Output file that receives the extracted <programlisting> code examples.
CODE_FILE = "php_code.txt"
# Note: Basic placeholders from XML. Nothing visible in this file writes to it.
COMMENTS_FILE = "php_comments.txt"
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as a side effect of splitting.
    Any falsy input (``None`` or an empty string) yields an empty string.
    """
    # str.split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with one space normalises the spacing.
    words = text.split() if text else []
    return " ".join(words)
def _extract_code_snippets(root):
    """Return the stripped text of every non-empty <programlisting> element."""
    snippets = []
    # local-name() matches the element whatever its XML namespace is. A bare
    # "//programlisting" only matches elements in *no* namespace, so it finds
    # nothing in documents that declare a default namespace (DocBook 5 does).
    for node in root.xpath("//*[local-name()='programlisting']"):
        # NOTE: .text is only the text before the first child element; the
        # examples are expected to be plain text / CDATA.
        code = (node.text or "").strip()
        if code:
            snippets.append(code)
    return snippets


def _extract_text_blocks(root):
    """Return the normalised text of <para>, <title> and <simpara> elements
    that are not nested inside a <programlisting>."""
    nodes = root.xpath(
        "//*[(local-name()='para' or local-name()='title'"
        " or local-name()='simpara')"
        " and not(ancestor::*[local-name()='programlisting'])]"
    )
    # clean_text collapses the line breaks and indentation carried over from
    # the XML source, so every element ends up on exactly one output line.
    cleaned = (clean_text("".join(node.itertext())) for node in nodes)
    return [text for text in cleaned if text]


def parse_php_docs():
    """Extract code examples and prose from the XML files under SOURCE_DIR.

    Walks SOURCE_DIR recursively and parses every ``*.xml`` file with a
    recovering lxml parser. The text of each ``<programlisting>`` is written
    to CODE_FILE, preceded by a ``--- Source: <file name> ---`` header. The
    text of each ``<para>``, ``<title>`` and ``<simpara>`` outside a code
    listing is written to TEXT_FILE, one element per line.

    Both output files are truncated and rewritten on every run. Files that
    cannot be read or parsed are skipped and counted; the count is printed at
    the end. Errors while writing the output files are not suppressed.

    Returns:
        None.
    """
    print("Starting extraction... this may take a minute.")
    # We use recover=True because some XML entities might be missing
    parser = etree.XMLParser(recover=True, remove_comments=False)
    skipped = 0
    # Opening once in 'w' mode both resets the files and avoids reopening
    # them in append mode for every single input file.
    with open(TEXT_FILE, 'w', encoding='utf-8') as f_text, \
            open(CODE_FILE, 'w', encoding='utf-8') as f_code:
        for root_dir, dirs, files in os.walk(SOURCE_DIR):
            # Sorting in place makes os.walk descend in a stable order, so
            # repeated runs produce identically ordered output.
            dirs.sort()
            for filename in sorted(files):
                if not filename.endswith(".xml"):
                    continue
                file_path = os.path.join(root_dir, filename)
                try:
                    root = etree.parse(file_path, parser).getroot()
                except (etree.LxmlError, OSError):
                    # Some files are meta-files/entities and might fail; we
                    # skip them, but only for parse/read errors.
                    skipped += 1
                    continue
                if root is None:
                    # Defensive: nothing usable was recovered from the file.
                    skipped += 1
                    continue
                # 1. Extract CODE snippets
                for code in _extract_code_snippets(root):
                    f_code.write(f"--- Source: {filename} ---\n")
                    f_code.write(code + "\n\n")
                # 2. Extract TEXT documentation
                f_text.writelines(
                    line + "\n" for line in _extract_text_blocks(root)
                )
    print(f"Done! Created {TEXT_FILE} and {CODE_FILE}")
    if skipped:
        print(f"Skipped {skipped} file(s) that could not be parsed.")
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    parse_php_docs()