#!/usr/bin/env python3
"""
Final cleanup script to ensure complete separation of bilingual documentation.

English files (``docs/en/skills/*.md``) have Vietnamese lines removed and
Vietnamese files (``docs/vi/skills/*.md``) have English lines removed.  The
language detection is heuristic (word lists plus a few characteristic
letters), and the files are rewritten in place, so every rule below is
written to keep a line whenever the evidence is ambiguous.
"""

import os
import re
import glob
import unicodedata

# Lines starting with one of these are Markdown structure (headings, quotes,
# fences, tables, list items) and are never judged as prose.
_STRUCTURAL_PREFIXES = ('#', '>', '```', '|', '-', '*')

# Markers that open/close a fenced code block.
_FENCE_MARKERS = ('```', '~~~')

# Vietnamese words/phrases that mark a block-quote line as Vietnamese.
_VIETNAMESE_QUOTE_WORDS = (
    'thực hành', 'mẫu', 'nền tảng', 'sử dụng', 'triển khai', 'bảo vệ',
    'xác thực', 'giới hạn', 'quản lý', 'kiểm tra',
)

# Vietnamese words/phrases that mark a prose line as Vietnamese.
_VIETNAMESE_WORDS = (
    'các', 'cho', 'để', 'và', 'là', 'trong', 'với', 'này', 'có', 'không',
    'từ', 'được', 'sẽ', 'nên', 'cần', 'khi', 'thì', 'lại', 'rất', 'đã',
    'thực', 'hiện', 'dụng', 'bảo', 'mật', 'xác', 'thức', 'phân', 'quyền',
    'dữ liệu', 'nhạy cảm', 'giới hạn', 'tốc độ', 'quản lý', 'bí mật',
    'kiểm tra',
)

# Additional phrases only consulted when cleaning Vietnamese files.
_VIETNAMESE_EXTRA_WORDS = ('tổng quan', 'khi nào sử dụng', 'khái niệm chính')

# Prose lines in Vietnamese files containing these substrings are kept even
# without Vietnamese evidence.
# NOTE(review): presumably untranslated technical terms - confirm the intent.
_KEPT_TECHNICAL_TERMS = ('patterns', 'cache')

# English substrings that mark a block-quote line as English.
_ENGLISH_QUOTE_WORDS = (
    'security', 'best practices', 'patterns', 'platform', 'use when',
    'implementing', 'authentication',
)

# Combining marks (visible after NFD decomposition) that are characteristic
# of Vietnamese spelling: breve, hook above, horn, dot below.  Acute, grave,
# circumflex and tilde are deliberately excluded because they also occur in
# loanwords used in English text.
_VIETNAMESE_MARKS = frozenset('\u0306\u0309\u031b\u0323')


def _compile_words(words):
    """Build a regex matching any of *words* as a whole word or phrase.

    Whole-word matching is what stops ``'trong'`` from matching "strong" and
    ``'cho'`` from matching "choose" or "echo".
    """
    escaped = sorted(
        (re.escape(unicodedata.normalize('NFC', word)) for word in words),
        key=len,
        reverse=True,
    )
    return re.compile(r'(?<!\w)(?:' + '|'.join(escaped) + r')(?!\w)')


_VIETNAMESE_QUOTE_RE = _compile_words(_VIETNAMESE_QUOTE_WORDS)
_VIETNAMESE_WORD_RE = _compile_words(_VIETNAMESE_WORDS)
_VIETNAMESE_WORD_EXT_RE = _compile_words(
    _VIETNAMESE_WORDS + _VIETNAMESE_EXTRA_WORDS
)


def _normalize(text: str) -> str:
    """Return *text* lower-cased in NFC so it compares equal to the word lists."""
    return unicodedata.normalize('NFC', text).lower()


def _has_vietnamese_letters(text: str) -> bool:
    """Return True if *text* contains a letter characteristic of Vietnamese."""
    decomposed = unicodedata.normalize('NFD', text.lower())
    return 'đ' in decomposed or any(
        char in _VIETNAMESE_MARKS for char in decomposed
    )


def _fence_marker(stripped: str):
    """Return the fence marker *stripped* starts with, or None."""
    for marker in _FENCE_MARKERS:
        if stripped.startswith(marker):
            return marker
    return None


def _filter_lines(lines, should_drop):
    """Return the lines to keep, passing fenced code blocks through untouched.

    Args:
        lines: The file's lines, including their line endings.
        should_drop: Callable taking a stripped line outside any code fence
            and returning True when that line must be removed.

    Returns:
        A new list holding the surviving lines in their original order.
    """
    kept = []
    open_fence = None
    for line in lines:
        stripped = line.strip()
        marker = _fence_marker(stripped)

        if open_fence is not None:
            # Code samples are language-neutral: never filter inside a fence.
            kept.append(line)
            # A closing fence consists solely of the fence character.
            if marker == open_fence and set(stripped) == {marker[0]}:
                open_fence = None
            continue

        # A backtick fence's info string cannot contain backticks; a line
        # such as "```x```" is an inline code span, not an opening fence.
        if marker is not None and (
            marker != '```' or '`' not in stripped.lstrip('`')
        ):
            open_fence = marker
            kept.append(line)
            continue

        if not should_drop(stripped):
            kept.append(line)
    return kept


def _is_vietnamese_line(stripped: str) -> bool:
    """Decide whether a line of an English file is Vietnamese content."""
    if not stripped:
        return False
    text = _normalize(stripped)
    if stripped.startswith('>'):
        return bool(_VIETNAMESE_QUOTE_RE.search(text))
    if stripped.startswith(_STRUCTURAL_PREFIXES):
        return False
    return bool(_VIETNAMESE_WORD_RE.search(text))


def _is_english_line(stripped: str) -> bool:
    """Decide whether a line of a Vietnamese file is English content."""
    if not stripped:
        return False
    text = _normalize(stripped)
    looks_vietnamese = (
        bool(_VIETNAMESE_WORD_EXT_RE.search(text))
        or _has_vietnamese_letters(text)
    )
    if stripped.startswith('>'):
        # A Vietnamese description may legitimately contain a loanword such
        # as "platform"; only drop quotes with no Vietnamese evidence.
        return not looks_vietnamese and any(
            word in text for word in _ENGLISH_QUOTE_WORDS
        )
    if stripped.startswith(_STRUCTURAL_PREFIXES):
        return False
    if looks_vietnamese:
        return False
    return not any(term in text for term in _KEPT_TECHNICAL_TERMS)


def _rewrite_file(filepath: str, should_drop) -> None:
    """Filter *filepath* in place, writing only when something was removed."""
    # newline='' keeps the file's own line endings instead of translating.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        lines = f.readlines()

    kept = _filter_lines(lines, should_drop)
    if kept == lines:
        return

    with open(filepath, 'w', encoding='utf-8', newline='') as f:
        f.writelines(kept)


def clean_english_file(filepath: str) -> None:
    """Remove all Vietnamese content from English files.

    Block-quote lines are removed when they contain one of the Vietnamese
    description phrases; other prose lines are removed when they contain a
    Vietnamese indicator as a whole word.  Headings, tables, list items,
    blank lines and everything inside fenced code blocks are kept.

    Args:
        filepath: Path of the UTF-8 Markdown file to clean in place.
    """
    print(f"Cleaning English file: {filepath}")
    _rewrite_file(filepath, _is_vietnamese_line)


def clean_vietnamese_file(filepath: str) -> None:
    """Remove all English content from Vietnamese files.

    Block-quote lines are removed when they contain an English description
    word and show no Vietnamese evidence; other prose lines are removed when
    they show no Vietnamese evidence and contain none of the kept technical
    terms.  Headings, tables, list items, blank lines and everything inside
    fenced code blocks are kept.

    Args:
        filepath: Path of the UTF-8 Markdown file to clean in place.
    """
    print(f"Cleaning Vietnamese file: {filepath}")
    _rewrite_file(filepath, _is_english_line)


def main() -> None:
    """Clean every English and Vietnamese skill document under ``docs/``."""
    print("Starting final cleanup of bilingual documentation...")

    # Clean English files
    print("Cleaning English documentation files...")
    for filepath in sorted(glob.glob('docs/en/skills/*.md')):
        if os.path.isfile(filepath):
            clean_english_file(filepath)

    # Clean Vietnamese files
    print("Cleaning Vietnamese documentation files...")
    for filepath in sorted(glob.glob('docs/vi/skills/*.md')):
        if os.path.isfile(filepath):
            clean_vietnamese_file(filepath)

    print("Final cleanup completed!")


if __name__ == '__main__':
    main()