"""Extract prose and code examples from a local checkout of the PHP manual.

Walks ``SOURCE_DIR`` for ``.xml`` files and writes two plain-text files:
``CODE_FILE`` (contents of ``programlisting`` elements) and ``TEXT_FILE``
(contents of ``para``, ``title`` and ``simpara`` elements).
"""

import os
import sys

from lxml import etree

# Configuration
SOURCE_DIR = './doc-en'  # Path to your cloned php/doc-en repo
TEXT_FILE = 'php_text.txt'
CODE_FILE = 'php_code.txt'
# Reserved for XML comment output; nothing in this script writes it yet.
COMMENTS_FILE = 'php_comments.txt'

# Match on local-name() so elements are found whether or not the document
# declares a default namespace. DocBook 5 files do declare one, and a plain
# "//programlisting" would then match nothing.
_CODE_XPATH = "//*[local-name()='programlisting']"
_TEXT_XPATH = (
    "//*[local-name()='para' or local-name()='title'"
    " or local-name()='simpara']"
    # Never emit prose that actually lives inside a code listing.
    "[not(ancestor::*[local-name()='programlisting'])]"
)


def clean_text(text):
    """Collapse every run of whitespace in *text* to a single space.

    Returns an empty string when *text* is ``None`` or empty.
    """
    if not text:
        return ""
    return " ".join(text.split())


def _write_code_snippets(root, source_name, f_code):
    """Write each non-empty programlisting under *root* to *f_code*."""
    for listing in root.xpath(_CODE_XPATH):
        # itertext() rather than .text: .text stops at the first child
        # element and would truncate listings that contain inline markup.
        code = "".join(listing.itertext()).strip()
        if code:
            f_code.write(f"--- Source: {source_name} ---\n")
            f_code.write(code + "\n\n")


def _write_text(root, f_text):
    """Write each non-empty para/title/simpara under *root* to *f_text*."""
    for node in root.xpath(_TEXT_XPATH):
        txt = "".join(node.itertext()).strip()
        if txt:
            f_text.write(txt + "\n")


def parse_php_docs(source_dir=SOURCE_DIR, text_file=TEXT_FILE,
                   code_file=CODE_FILE):
    """Extract documentation text and code snippets from the XML sources.

    Args:
        source_dir: Directory that is walked recursively for ``.xml`` files.
        text_file: Output path for prose; overwritten on every run.
        code_file: Output path for code listings; overwritten on every run.

    Files that cannot be read or parsed are skipped and counted; the count
    is reported at the end. If *source_dir* is not a directory, a message is
    printed to stderr and the output files are left untouched.
    """
    if not os.path.isdir(source_dir):
        print(f"Source directory not found: {source_dir}", file=sys.stderr)
        return

    print("Starting extraction... this may take a minute.")

    # We use recover=True because some XML entities might be missing.
    # One parser instance is reused for every file.
    parser = etree.XMLParser(recover=True, remove_comments=False)
    skipped = 0

    # Opening in 'w' mode once guarantees fresh output files and avoids
    # re-opening both files for every XML document.
    with open(text_file, 'w', encoding='utf-8') as f_text, \
            open(code_file, 'w', encoding='utf-8') as f_code:
        for root_dir, dirs, files in os.walk(source_dir):
            dirs.sort()  # in-place sort makes the walk order deterministic
            for file in sorted(files):
                if not file.endswith(".xml"):
                    continue
                file_path = os.path.join(root_dir, file)
                try:
                    root = etree.parse(file_path, parser).getroot()
                except (etree.LxmlError, OSError):
                    # Some files are meta-files/entities and might fail;
                    # we skip them but keep count instead of hiding it.
                    skipped += 1
                    continue
                if root is None:
                    # Recovery produced a document without a root element.
                    skipped += 1
                    continue

                # 1. Extract CODE snippets
                # <programlisting> usually contains the actual PHP examples.
                _write_code_snippets(root, file, f_code)

                # 2. Extract TEXT documentation (paragraphs and titles)
                _write_text(root, f_text)

    if skipped:
        print(f"Skipped {skipped} file(s) that could not be parsed.")
    print(f"Done! Created {text_file} and {code_file}")


if __name__ == "__main__":
    parse_php_docs()