# pos-system/scripts/fix-bilingual-docs.py

#!/usr/bin/env python3
"""
Script to separate bilingual documentation into language-specific files.
Removes Vietnamese content from English docs and English content from Vietnamese docs.
"""

import os
import re
import glob

def process_english_file(filepath):
    """Rewrite a bilingual Markdown file in place, keeping only English content.

    The file is read as UTF-8, transformed, and written back to the same path.
    Bilingual text is assumed to follow the ``English / Vietnamese`` ordering
    used by the label table below. The steps run in this order:

    1. ``> **EN**: ...`` / ``> **VI**: ...`` description pairs collapse to the
       English text; leftover ``> **VI**:`` lines are dropped.
    2. Headings of the form ``# English / Vietnamese`` keep the English part
       (everything before the last " / " separator).
    3. Inline ``**EN**: x`` / ``**VI**: y`` line pairs collapse to ``x``.
    4. Block-form ``**VI**:`` sections are removed up to the next language
       marker or Markdown heading.
    5. Remaining ``**VI**: ...`` lines and all ``**EN**:`` markers are removed.
    6. Known bilingual bold labels are replaced by their English form.
    7. Lines starting with ``//`` that contain " / " keep only the text before
       the first separator.

    Args:
        filepath: Path of the Markdown file to rewrite.
    """
    print(f"Processing English file: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Description pair: keep only the English sentence.
    content = re.sub(r'^> \*\*EN\*\*: (.+)\n> \*\*VI\*\*: .+', r'> \1', content, flags=re.MULTILINE)

    # Vietnamese description lines that had no English partner.
    content = re.sub(r'^> \*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Bilingual titles: keep the part BEFORE the separator (the English half).
    # Capturing the trailing part here would leave Vietnamese titles behind.
    content = re.sub(r'^(#{1,6}) (.+) / .+$', r'\1 \2', content, flags=re.MULTILINE)

    # Inline pair "**EN**: x\n**VI**: y" -> "x".
    content = re.sub(r'\*\*EN\*\*: ([^\n]+)\n\*\*VI\*\*: [^\n]+', r'\1', content)

    # Block-form **VI** sections. re.MULTILINE is required so that the
    # '^#{1,6}' alternative in the lookahead matches at every line start;
    # without it the removal would run through headings and swallow the
    # sections that follow.
    content = re.sub(
        r'\*\*VI\*\*:\n(?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*',
        '',
        content,
        flags=re.MULTILINE,
    )

    # Single-line **VI**: entries that were not part of a pair.
    content = re.sub(r'^\*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Strip the **EN**: markers themselves: first the block form (marker alone
    # on its line), then the inline form followed by a space.
    content = re.sub(r'^\*\*EN\*\*:[ \t]*\n', '', content, flags=re.MULTILINE)
    content = content.replace('**EN**: ', '')

    # Known bilingual bold labels and their English-only replacements.
    replacements = [
        ('**Reference / Tham Khảo**:', '**Reference**:'),
        ('**Patterns / Các Patterns**:', '**Patterns**:'),
        ('**Key Generators / Bộ Tạo Key**:', '**Key Generators**:'),
        ('**Speed / Tốc Độ**:', '**Speed**:'),
        ('**Capacity / Dung Lượng**:', '**Capacity**:'),
        ('**Scope / Phạm Vi**:', '**Scope**:'),
        ('**Use Case / Trường Hợp Sử Dụng**:', '**Use Case**:'),
        ('**L1 Cache (Memory) / Cache Bộ Nhớ**:', '**L1 Cache (Memory)**:'),
        ('**L2 Cache (Redis) / Cache Redis**:', '**L2 Cache (Redis)**:'),
    ]

    for old, new in replacements:
        content = content.replace(old, new)

    # Bilingual '//' comments: keep the text before the first ' / ', which
    # also preserves the line's original indentation.
    content = '\n'.join(
        line.split(' / ', 1)[0]
        if line.lstrip().startswith('//') and ' / ' in line
        else line
        for line in content.split('\n')
    )

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

def process_vietnamese_file(filepath):
    """Rewrite a bilingual Markdown file in place, keeping only Vietnamese content.

    The file is read as UTF-8, transformed, and written back to the same path.
    Bilingual text is assumed to follow the ``English / Vietnamese`` ordering
    used by the label table below. The steps run in this order:

    1. ``> **EN**: ...`` / ``> **VI**: ...`` description pairs collapse to the
       Vietnamese text; leftover ``> **EN**:`` lines are dropped.
    2. Headings of the form ``# English / Vietnamese`` keep the Vietnamese
       part (everything after the last " / " separator).
    3. Inline ``**EN**: x`` / ``**VI**: y`` line pairs collapse to ``y``.
    4. Block-form ``**EN**:`` sections are removed up to the next language
       marker or Markdown heading.
    5. Remaining ``**EN**: ...`` lines and all ``**VI**:`` markers are removed.
    6. Known bilingual bold labels are replaced by their Vietnamese form.
    7. Lines starting with ``//`` that contain " / " are rewritten as a ``//``
       comment holding only the text after the first separator, at the
       line's original indentation.

    Args:
        filepath: Path of the Markdown file to rewrite.
    """
    print(f"Processing Vietnamese file: {filepath}")

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Description pair: keep only the Vietnamese sentence.
    content = re.sub(r'^> \*\*EN\*\*: .+\n> \*\*VI\*\*: (.+)', r'> \1', content, flags=re.MULTILINE)

    # English description lines that had no Vietnamese partner.
    content = re.sub(r'^> \*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Bilingual titles: keep the part AFTER the separator (the Vietnamese half).
    content = re.sub(r'^(#{1,6}) .+ / (.+)$', r'\1 \2', content, flags=re.MULTILINE)

    # Inline pair "**EN**: x\n**VI**: y" -> "y".
    content = re.sub(r'\*\*EN\*\*: [^\n]+\n\*\*VI\*\*: ([^\n]+)', r'\1', content)

    # Block-form **EN** sections. re.MULTILINE is required so that the
    # '^#{1,6}' alternative in the lookahead matches at every line start;
    # without it the removal would run through headings and swallow the
    # sections that follow.
    content = re.sub(
        r'\*\*EN\*\*:\n(?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*',
        '',
        content,
        flags=re.MULTILINE,
    )

    # Single-line **EN**: entries that were not part of a pair.
    content = re.sub(r'^\*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Strip the **VI**: markers themselves: first the block form (marker alone
    # on its line), then the inline form followed by a space.
    content = re.sub(r'^\*\*VI\*\*:[ \t]*\n', '', content, flags=re.MULTILINE)
    content = content.replace('**VI**: ', '')

    # Known bilingual bold labels and their Vietnamese-only replacements.
    replacements = [
        ('**Reference / Tham Khảo**:', '**Tham Khảo**:'),
        ('**Patterns / Các Patterns**:', '**Các Patterns**:'),
        ('**Key Generators / Bộ Tạo Key**:', '**Bộ Tạo Key**:'),
        ('**Speed / Tốc Độ**:', '**Tốc Độ**:'),
        ('**Capacity / Dung Lượng**:', '**Dung Lượng**:'),
        ('**Scope / Phạm Vi**:', '**Phạm Vi**:'),
        ('**Use Case / Trường Hợp Sử Dụng**:', '**Trường Hợp Sử Dụng**:'),
        ('**L1 Cache (Memory) / Cache Bộ Nhớ**:', '**Cache Bộ Nhớ**:'),
        ('**L2 Cache (Redis) / Cache Redis**:', '**Cache Redis**:'),
    ]

    for old, new in replacements:
        content = content.replace(old, new)

    # Bilingual '//' comments: rebuild the comment from the Vietnamese half,
    # re-applying the line's leading whitespace so indented code stays aligned.
    processed_lines = []
    for line in content.split('\n'):
        stripped = line.lstrip()
        if stripped.startswith('//') and ' / ' in line:
            indent = line[:len(line) - len(stripped)]
            vietnamese_part = line.split(' / ', 1)[1]
            processed_lines.append(f"{indent}// {vietnamese_part}")
        else:
            processed_lines.append(line)

    content = '\n'.join(processed_lines)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)

def main():
    """Run the in-place language separation over both documentation trees.

    Markdown files matching ``docs/en/skills/*.md`` are passed to
    ``process_english_file`` and those matching ``docs/vi/skills/*.md`` to
    ``process_vietnamese_file``. Both glob patterns are relative, so they are
    resolved against the current working directory. Progress messages are
    printed to standard output.
    """
    print("Starting bilingual documentation separation...")

    # English tree: only regular files are processed.
    print("Processing English documentation files...")
    for english_doc in filter(os.path.isfile, glob.glob('docs/en/skills/*.md')):
        process_english_file(english_doc)

    # Vietnamese tree: same filtering, different handler.
    print("Processing Vietnamese documentation files...")
    for vietnamese_doc in filter(os.path.isfile, glob.glob('docs/vi/skills/*.md')):
        process_vietnamese_file(vietnamese_doc)

    print("Bilingual documentation separation completed!")

# Run the separation only when this file is executed as a script, so that
# importing the module does not rewrite any documentation files.
if __name__ == '__main__':
    main()