# molenda.net/eeeeee/scraper.py

import os
from lxml import etree

# Configuration
# Root directory that is walked recursively for *.xml files.
SOURCE_DIR = './doc-en'  # Path to your cloned php/doc-en repo
# Output file receiving the text of <para>, <title> and <simpara> elements.
TEXT_FILE = 'php_text.txt'
# Output file receiving the text of <programlisting> elements.
CODE_FILE = 'php_code.txt'
# NOTE(review): not referenced by any code visible in this file -- no comments
# file is written or cleared; confirm whether this constant is still needed.
COMMENTS_FILE = 'php_comments.txt' # Note: Basic placeholders from XML

def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as a side effect of the
    split/join. Any falsy input (``None`` or an empty string) yields ``""``.
    """
    words = text.split() if text else []
    return " ".join(words)

def _extract_code_and_text(root):
    """Return ``(code_snippets, text_lines)`` for one parsed XML document.

    Elements are matched on ``local-name()`` so they are found whether or not
    the document declares a default namespace; a bare ``//para`` only matches
    elements that are in *no* namespace.
    NOTE(review): DocBook 5 sources normally declare
    xmlns="http://docbook.org/ns/docbook" -- presumably true for doc-en;
    confirm against the checkout.
    """
    code_xpath = "//*[local-name()='programlisting']"
    # Prose elements, excluding anything nested inside a code listing.
    text_xpath = (
        "//*[local-name()='para' or local-name()='title'"
        " or local-name()='simpara']"
        "[not(ancestor::*[local-name()='programlisting'])]"
    )

    # NOTE: .text only holds the text up to the first child element, which is
    # the whole listing when the example is a single text/CDATA node.
    stripped = (node.text.strip() for node in root.xpath(code_xpath) if node.text)
    code_snippets = [snippet for snippet in stripped if snippet]

    # Normalise whitespace so each paragraph/title ends up on exactly one line
    # instead of carrying over the XML source's indentation and line breaks.
    cleaned = (
        clean_text("".join(node.itertext())) for node in root.xpath(text_xpath)
    )
    text_lines = [line for line in cleaned if line]

    return code_snippets, text_lines


def parse_php_docs():
    """Extract code examples and prose from every ``.xml`` file in SOURCE_DIR.

    Walks SOURCE_DIR recursively (in sorted order, so repeated runs produce
    identical output) and writes:

    * CODE_FILE -- the text of each ``<programlisting>``, preceded by a
      ``--- Source: <file name> ---`` header line;
    * TEXT_FILE -- one whitespace-normalised line per ``<para>``, ``<title>``
      or ``<simpara>`` that is not nested inside a ``<programlisting>``.

    Both output files are truncated at the start of the run. Files that cannot
    be read or parsed are skipped; the number skipped is printed at the end.
    Returns ``None``.
    """
    print("Starting extraction... this may take a minute.")

    # recover=True lets lxml keep going past problems such as undefined
    # entities. One parser instance is reused for every file.
    parser = etree.XMLParser(recover=True, remove_comments=False)
    skipped = 0

    # Open each output once in 'w' mode: this truncates stale results and
    # avoids re-opening both files for every single input document.
    with open(CODE_FILE, 'w', encoding='utf-8') as f_code, \
            open(TEXT_FILE, 'w', encoding='utf-8') as f_text:
        for root_dir, dirs, files in os.walk(SOURCE_DIR):
            dirs.sort()  # in-place sort makes os.walk descend deterministically
            for file in sorted(files):
                if not file.endswith(".xml"):
                    continue
                file_path = os.path.join(root_dir, file)

                try:
                    root = etree.parse(file_path, parser).getroot()
                    if root is None:
                        # Nothing could be recovered from this file.
                        skipped += 1
                        continue
                    snippets, text_lines = _extract_code_and_text(root)
                except (etree.LxmlError, OSError):
                    # Some files are meta-files/entity fragments and fail to
                    # parse; skip them but keep count instead of hiding it.
                    skipped += 1
                    continue

                for snippet in snippets:
                    f_code.write(f"--- Source: {file} ---\n")
                    f_code.write(snippet + "\n\n")
                f_text.writelines(line + "\n" for line in text_lines)

    print(f"Done! Created {TEXT_FILE} and {CODE_FILE}")
    if skipped:
        print(f"Skipped {skipped} file(s) that could not be read or parsed.")

# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    parse_php_docs()