"""Extract documentation prose and code examples from a php/doc-en checkout into plain-text files."""
import os
import sys

from lxml import etree
# Configuration
SOURCE_DIR = './doc-en'  # Path to your cloned php/doc-en repo
TEXT_FILE = 'php_text.txt'  # Output: text of <para>/<title>/<simpara> elements
CODE_FILE = 'php_code.txt'  # Output: contents of <programlisting> elements
# Note: Basic placeholders from XML. Nothing in this file writes to this path yet.
COMMENTS_FILE = 'php_comments.txt'
def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Leading and trailing whitespace is removed. A falsy *text* (``None`` or
    the empty string) yields an empty string.
    """
    if not text:
        return ""
    # split() with no separator splits on any whitespace run and discards
    # empty pieces, so re-joining with one space normalizes the string.
    return " ".join(text.split())
def parse_php_docs():
    """Extract prose and code examples from the XML files under SOURCE_DIR.

    Walks SOURCE_DIR recursively (in sorted order, so repeated runs produce
    identical output) and parses every ``*.xml`` file with lxml in recovery
    mode.

    * The text of each ``programlisting`` element is written to CODE_FILE,
      preceded by a ``--- Source: <file name> ---`` header line.
    * The text of each ``para``, ``title`` and ``simpara`` element that is not
      nested inside a ``programlisting`` is whitespace-normalized with
      ``clean_text`` and written to TEXT_FILE, one entry per line.

    Both output files are truncated at the start of the run. Files that
    cannot be read or parsed are reported on stderr and skipped.
    """
    # An unprefixed XPath name test such as ``//para`` only matches elements
    # that are in no namespace, so documents declaring a default namespace
    # (DocBook 5 does) would yield nothing. Matching on local-name() works
    # whether or not the document is namespaced.
    code_xpath = "//*[local-name()='programlisting']"
    text_xpath = (
        "//*[local-name()='para' or local-name()='title'"
        " or local-name()='simpara']"
        "[not(ancestor::*[local-name()='programlisting'])]"
    )

    print("Starting extraction... this may take a minute.")

    skipped = 0
    # Open each output once in write mode: this starts from fresh files and
    # avoids reopening them in append mode for every XML file.
    with open(TEXT_FILE, 'w', encoding='utf-8') as f_text, \
            open(CODE_FILE, 'w', encoding='utf-8') as f_code:
        for root_dir, dirs, files in os.walk(SOURCE_DIR):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".xml"):
                    continue
                file_path = os.path.join(root_dir, file)

                try:
                    # We use recover=True because some XML entities might
                    # be missing.
                    parser = etree.XMLParser(recover=True, remove_comments=False)
                    root = etree.parse(file_path, parser).getroot()
                except (etree.LxmlError, OSError) as exc:
                    # Some files are meta-files/entities and might fail;
                    # skip them, but say so instead of hiding the error.
                    print(f"Skipping {file_path}: {exc}", file=sys.stderr)
                    skipped += 1
                    continue
                if root is None:
                    # Recovery mode can produce a document with no root.
                    print(f"Skipping {file_path}: no root element", file=sys.stderr)
                    skipped += 1
                    continue

                # 1. Extract CODE snippets.
                for listing in root.xpath(code_xpath):
                    code = (listing.text or "").strip()
                    if code:
                        f_code.write(f"--- Source: {file} ---\n")
                        f_code.write(code + "\n\n")

                # 2. Extract TEXT documentation; the XPath already excludes
                # anything nested inside a code block.
                for node in root.xpath(text_xpath):
                    txt = clean_text("".join(node.itertext()))
                    if txt:
                        f_text.write(txt + "\n")

    if skipped:
        print(f"Skipped {skipped} unreadable file(s).", file=sys.stderr)
    print(f"Done! Created {TEXT_FILE} and {CODE_FILE}")
# Run the extraction only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    parse_php_docs()