# pos-system/scripts/final-cleanup.py

#!/usr/bin/env python3
"""
Final cleanup script to ensure complete separation of bilingual documentation.
"""

import os
import re
import glob

def clean_english_file(filepath):
    """Remove Vietnamese content from an English documentation file in place.

    The file is read as UTF-8, filtered line by line, and written back to the
    same path. Two kinds of lines are dropped:

    * blockquote lines (starting with ``>``) that contain one of a fixed set
      of Vietnamese phrases;
    * prose lines -- non-empty lines that do not start with a Markdown
      structural marker (``#``, ``>``, a code fence, ``|``, ``-``, ``*``) --
      that contain a Vietnamese indicator as a whole word.

    Indicators are matched as whole words rather than substrings so that
    English words which merely contain one (``cho`` in "choose", ``trong`` in
    "strong") do not cause the line to be deleted.

    Args:
        filepath: Path of the Markdown file to clean; it is overwritten.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print(f"Cleaning English file: {filepath}")

    # Phrases that mark a blockquote description line as Vietnamese.
    vietnamese_quote_phrases = (
        'thực hành', 'mẫu', 'nền tảng', 'sử dụng', 'triển khai', 'bảo vệ',
        'xác thực', 'giới hạn', 'quản lý', 'kiểm tra',
    )
    # Words/phrases that mark a prose line as Vietnamese.
    vietnamese_indicators = (
        'các', 'cho', 'để', 'và', 'là', 'trong', 'với', 'này', 'có', 'không',
        'từ', 'được', 'sẽ', 'nên', 'cần', 'khi', 'thì', 'lại', 'rất', 'đã',
        'thực', 'hiện', 'dụng', 'bảo', 'mật', 'xác', 'thức', 'phân', 'quyền',
        'dữ liệu', 'nhạy cảm', 'giới hạn', 'tốc độ', 'quản lý', 'bí mật', 'kiểm tra',
    )
    # Lookarounds instead of \b so an indicator only matches when it is not
    # glued to another word character on either side.
    indicator_pattern = re.compile(
        r'(?<!\w)(?:'
        + '|'.join(re.escape(word) for word in vietnamese_indicators)
        + r')(?!\w)'
    )
    # Lines starting with these markers are never treated as prose.
    structural_prefixes = ('#', '>', '```', '|', '-', '*')

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    for line in lines:
        stripped = line.strip()

        # Drop Vietnamese blockquote description lines.
        if stripped.startswith('>') and any(
            phrase in line.lower() for phrase in vietnamese_quote_phrases
        ):
            continue

        # Drop prose lines that contain a Vietnamese indicator word.
        if (
            stripped
            and not stripped.startswith(structural_prefixes)
            and indicator_pattern.search(stripped.lower())
        ):
            continue

        cleaned_lines.append(line)

    # Write back the cleaned content
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_lines)

def clean_vietnamese_file(filepath):
    """Remove English content from a Vietnamese documentation file in place.

    The file is read as UTF-8, filtered line by line, and written back to the
    same path. Outside fenced code blocks, two kinds of lines are dropped:

    * blockquote lines (starting with ``>``) that contain one of a fixed set
      of English phrases;
    * prose lines -- non-empty lines that do not start with a Markdown
      structural marker (``#``, ``>``, a code fence, ``|``, ``-``, ``*``) --
      that contain no Vietnamese indicator as a whole word.

    Fence lines and everything between an opening and closing fence are kept
    verbatim: code samples contain no Vietnamese words and would otherwise be
    deleted, leaving empty fences behind.

    Args:
        filepath: Path of the Markdown file to clean; it is overwritten.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print(f"Cleaning Vietnamese file: {filepath}")

    # Phrases that mark a blockquote description line as English.
    english_quote_phrases = (
        'security', 'best practices', 'patterns', 'platform', 'use when',
        'implementing', 'authentication',
    )
    # Words/phrases whose presence marks a prose line as worth keeping.
    vietnamese_indicators = (
        'các', 'cho', 'để', 'và', 'là', 'trong', 'với', 'này', 'có', 'không',
        'từ', 'được', 'sẽ', 'nên', 'cần', 'khi', 'thì', 'lại', 'rất', 'đã',
        'thực', 'hiện', 'dụng', 'bảo', 'mật', 'xác', 'thức', 'phân', 'quyền',
        'dữ liệu', 'nhạy cảm', 'giới hạn', 'tốc độ', 'quản lý', 'bí mật', 'kiểm tra',
        'tổng quan', 'khi nào sử dụng', 'khái niệm chính', 'patterns', 'cache',
    )
    # Whole-word match: a substring test would treat "choose" (cho) or
    # "strong" (trong) as Vietnamese and keep English sentences.
    indicator_pattern = re.compile(
        r'(?<!\w)(?:'
        + '|'.join(re.escape(word) for word in vietnamese_indicators)
        + r')(?!\w)'
    )
    # Lines starting with these markers are never treated as prose.
    structural_prefixes = ('#', '>', '```', '|', '-', '*')

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    cleaned_lines = []
    in_code_fence = False

    for line in lines:
        stripped = line.strip()

        # Fence delimiters toggle code-block state and are always kept.
        if stripped.startswith('```'):
            in_code_fence = not in_code_fence
            cleaned_lines.append(line)
            continue

        # Code inside a fence is language-neutral; never filter it.
        if in_code_fence:
            cleaned_lines.append(line)
            continue

        # Drop English blockquote description lines.
        if stripped.startswith('>') and any(
            phrase in line.lower() for phrase in english_quote_phrases
        ):
            continue

        # Drop prose lines with no Vietnamese indicator, regardless of how
        # the line begins (an English opener such as "The" is no reason to
        # keep it).
        if (
            stripped
            and not stripped.startswith(structural_prefixes)
            and not indicator_pattern.search(stripped.lower())
        ):
            continue

        cleaned_lines.append(line)

    # Write back the cleaned content
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(cleaned_lines)

def main():
    """Clean every English and Vietnamese skill document under ``docs/``.

    English files matching ``docs/en/skills/*.md`` are processed first, then
    Vietnamese files matching ``docs/vi/skills/*.md``. Both patterns are
    relative to the current working directory, and only regular files are
    passed to the cleaners.
    """
    print("Starting final cleanup of bilingual documentation...")

    # English pass
    print("Cleaning English documentation files...")
    for english_doc in filter(os.path.isfile, glob.glob('docs/en/skills/*.md')):
        clean_english_file(english_doc)

    # Vietnamese pass
    print("Cleaning Vietnamese documentation files...")
    for vietnamese_doc in filter(os.path.isfile, glob.glob('docs/vi/skills/*.md')):
        clean_vietnamese_file(vietnamese_doc)

    print("Final cleanup completed!")

# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()