init
This commit is contained in:
@@ -0,0 +1,61 @@
|
||||
import os
|
||||
from lxml import etree
|
||||
|
||||
# Configuration
# Root directory that parse_php_docs() walks recursively looking for *.xml files.
SOURCE_DIR = './doc-en' # Path to your cloned php/doc-en repo
# Output file that receives the text of <para>, <title> and <simpara> elements.
TEXT_FILE = 'php_text.txt'
# Output file that receives the contents of <programlisting> elements.
CODE_FILE = 'php_code.txt'
# NOTE(review): no code visible in this file reads or writes this path —
# confirm whether comment extraction is still planned or drop the constant.
COMMENTS_FILE = 'php_comments.txt' # Note: Basic placeholders from XML
|
||||
|
||||
def clean_text(text):
    """Return *text* with all whitespace normalised to single spaces.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace (spaces, tabs, newlines) becomes one space. Any falsy input,
    such as ``None`` or ``""``, yields the empty string.
    """
    words = text.split() if text else []
    return " ".join(words)
|
||||
|
||||
def parse_php_docs():
    """Extract code samples and prose from every XML file under SOURCE_DIR.

    Walks SOURCE_DIR recursively and parses each ``*.xml`` file with lxml in
    recovering mode. For every file:

    * the contents of each ``<programlisting>`` element are appended to
      CODE_FILE, preceded by a ``--- Source: <file name> ---`` header;
    * the text of each ``<para>``, ``<title>`` and ``<simpara>`` element that
      is not inside a ``<programlisting>`` is written to TEXT_FILE, one
      whitespace-normalised line per element.

    Both output files are truncated at the start of the run. Files that cannot
    be read or parsed are skipped, and the number skipped is reported at the
    end. Returns ``None``.
    """
    # An unprefixed XPath name test only matches elements that are in *no*
    # namespace, so "//programlisting" finds nothing in documents that declare
    # a default namespace (as DocBook 5 does). Matching on local-name() works
    # for both namespaced and un-namespaced documents.
    code_query = "//*[local-name()='programlisting']"
    text_query = (
        "//*[local-name()='para' or local-name()='title'"
        " or local-name()='simpara']"
        "[not(ancestor::*[local-name()='programlisting'])]"
    )

    skipped = 0
    print("Starting extraction... this may take a minute.")

    # Open each output once in write mode: this truncates stale results and
    # avoids reopening both files for every XML document.
    with open(CODE_FILE, 'w', encoding='utf-8') as f_code, \
            open(TEXT_FILE, 'w', encoding='utf-8') as f_text:
        for root_dir, dirs, files in os.walk(SOURCE_DIR):
            # Sorting in place makes os.walk descend in a stable order, so
            # repeated runs produce identical output files.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".xml"):
                    continue
                file_path = os.path.join(root_dir, file)

                # We use recover=True because some XML entities might be missing
                parser = etree.XMLParser(recover=True, remove_comments=False)
                try:
                    root = etree.parse(file_path, parser).getroot()
                except (etree.LxmlError, OSError):
                    # Some files are meta-files/entities and might fail; skip
                    # them, but keep a count instead of hiding the failure.
                    skipped += 1
                    continue
                if root is None:
                    # Recovery produced no usable document element.
                    skipped += 1
                    continue

                # 1. CODE snippets: <programlisting> holds the examples.
                for listing in root.xpath(code_query):
                    # itertext() also picks up code that follows inline child
                    # elements, which .text alone would silently drop.
                    code = "".join(listing.itertext()).strip()
                    if code:
                        f_code.write(f"--- Source: {file} ---\n")
                        f_code.write(code + "\n\n")

                # 2. TEXT documentation: paragraphs and titles outside code.
                for element in root.xpath(text_query):
                    txt = clean_text("".join(element.itertext()))
                    if txt:
                        f_text.write(txt + "\n")

    if skipped:
        print(f"Skipped {skipped} XML file(s) that could not be parsed.")
    print(f"Done! Created {TEXT_FILE} and {CODE_FILE}")
|
||||
|
||||
# Run the extraction only when this file is executed as a script, so that
# importing the module does not trigger any file I/O.
if __name__ == "__main__":
    parse_php_docs()
|
||||
Reference in New Issue
Block a user