#!/usr/bin/env python3
"""
Script to separate bilingual documentation into language-specific files.
Removes Vietnamese content from English docs and English content from Vietnamese docs.
"""

import os
import re
import glob

# Bilingual labels as (bilingual form, English form, Vietnamese form).
# Kept in one table so the English and Vietnamese passes cannot drift apart.
_LABELS = [
    ('**Reference / Tham Khảo**:', '**Reference**:', '**Tham Khảo**:'),
    ('**Patterns / Các Patterns**:', '**Patterns**:', '**Các Patterns**:'),
    ('**Key Generators / Bộ Tạo Key**:', '**Key Generators**:', '**Bộ Tạo Key**:'),
    ('**Speed / Tốc Độ**:', '**Speed**:', '**Tốc Độ**:'),
    ('**Capacity / Dung Lượng**:', '**Capacity**:', '**Dung Lượng**:'),
    ('**Scope / Phạm Vi**:', '**Scope**:', '**Phạm Vi**:'),
    ('**Use Case / Trường Hợp Sử Dụng**:', '**Use Case**:', '**Trường Hợp Sử Dụng**:'),
    ('**L1 Cache (Memory) / Cache Bộ Nhớ**:', '**L1 Cache (Memory)**:', '**Cache Bộ Nhớ**:'),
    ('**L2 Cache (Redis) / Cache Redis**:', '**L2 Cache (Redis)**:', '**Cache Redis**:'),
]


def _split_code_comments(content, keep_english):
    """Reduce bilingual ``// English / Vietnamese`` comment lines to one language.

    Only lines whose first non-blank characters are ``//`` and that contain
    `` / `` are touched; every other line is passed through unchanged. The
    original indentation of the comment is preserved for both languages.
    """
    result = []
    for line in content.split('\n'):
        stripped = line.lstrip()
        if stripped.startswith('//') and ' / ' in line:
            english, vietnamese = line.split(' / ', 1)
            if keep_english:
                # The English half already carries the indent and the `//`.
                result.append(english)
            else:
                # Re-attach the indent and comment marker that lived on the
                # English half of the line.
                indent = line[:len(line) - len(stripped)]
                result.append(f"{indent}// {vietnamese}")
        else:
            result.append(line)
    return '\n'.join(result)


def _to_english(content):
    """Return ``content`` with the Vietnamese half of every bilingual construct removed."""
    # First, handle the description line - keep only English
    content = re.sub(r'^> \*\*EN\*\*: (.+)\n> \*\*VI\*\*: .+', r'> \1', content, flags=re.MULTILINE)

    # Remove standalone Vietnamese description lines
    content = re.sub(r'^> \*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Convert bilingual titles ("English / Vietnamese") to English only.
    # The lazy capture keeps the text BEFORE the first " / ".
    content = re.sub(r'^(#{1,6}) (.+?) / .+$', r'\1 \2', content, flags=re.MULTILINE)

    # For single-line EN/VI pairs, keep only the EN content
    content = re.sub(r'\*\*EN\*\*: ([^\n]+)\n\*\*VI\*\*: [^\n]+', r'\1', content)

    # Remove remaining multi-line **VI** blocks. re.MULTILINE is required so
    # that `^#{1,6}` in the lookahead matches at every line start and the
    # removal stops at the next heading instead of swallowing it.
    content = re.sub(
        r'\*\*VI\*\*:\n((?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*)',
        '',
        content,
        flags=re.MULTILINE,
    )

    # Remove standalone **VI**: lines
    content = re.sub(r'^\*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Remove **EN**: markers
    # NOTE(review): a bare "**EN**:" line (marker followed directly by a
    # newline) is left in place because this pattern needs the trailing
    # space - confirm whether that is intended.
    content = re.sub(r'\*\*EN\*\*: ', '', content)

    # Convert bilingual labels to English
    for bilingual, english, _vietnamese in _LABELS:
        content = content.replace(bilingual, english)

    # Convert bilingual comments in code to English
    return _split_code_comments(content, keep_english=True)


def _to_vietnamese(content):
    """Return ``content`` with the English half of every bilingual construct removed."""
    # First, handle the description line - keep only Vietnamese
    content = re.sub(r'^> \*\*EN\*\*: .+\n> \*\*VI\*\*: (.+)', r'> \1', content, flags=re.MULTILINE)

    # Remove standalone English description lines
    content = re.sub(r'^> \*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Convert bilingual titles ("English / Vietnamese") to Vietnamese only.
    # The greedy prefix keeps the text AFTER the last " / ".
    content = re.sub(r'^(#{1,6}) .+ / (.+)$', r'\1 \2', content, flags=re.MULTILINE)

    # For single-line EN/VI pairs, keep only the VI content
    content = re.sub(r'\*\*EN\*\*: [^\n]+\n\*\*VI\*\*: ([^\n]+)', r'\1', content)

    # Remove remaining multi-line **EN** blocks. re.MULTILINE is required so
    # that `^#{1,6}` in the lookahead matches at every line start and the
    # removal stops at the next heading instead of swallowing it.
    content = re.sub(
        r'\*\*EN\*\*:\n((?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*)',
        '',
        content,
        flags=re.MULTILINE,
    )

    # Remove standalone **EN**: lines
    content = re.sub(r'^\*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Remove **VI**: markers
    # NOTE(review): a bare "**VI**:" line (marker followed directly by a
    # newline) is left in place because this pattern needs the trailing
    # space - confirm whether that is intended.
    content = re.sub(r'\*\*VI\*\*: ', '', content)

    # Convert bilingual labels to Vietnamese
    for bilingual, _english, vietnamese in _LABELS:
        content = content.replace(bilingual, vietnamese)

    # Convert bilingual comments in code to Vietnamese
    return _split_code_comments(content, keep_english=False)


def _rewrite_file(filepath, transform):
    """Read ``filepath`` as UTF-8, apply ``transform`` to its text, and write it back."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Transform before reopening for writing so a failure in the transform
    # cannot leave a truncated file behind.
    content = transform(content)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)


def process_english_file(filepath):
    """Process an English file to keep only English content.

    The file at ``filepath`` is rewritten in place (UTF-8). Bilingual
    descriptions, titles, EN/VI blocks, labels and ``//`` code comments are
    reduced to their English half.
    """
    print(f"Processing English file: {filepath}")
    _rewrite_file(filepath, _to_english)


def process_vietnamese_file(filepath):
    """Process a Vietnamese file to keep only Vietnamese content.

    The file at ``filepath`` is rewritten in place (UTF-8). Bilingual
    descriptions, titles, EN/VI blocks, labels and ``//`` code comments are
    reduced to their Vietnamese half.
    """
    print(f"Processing Vietnamese file: {filepath}")
    _rewrite_file(filepath, _to_vietnamese)


def main():
    """Separate every skills document under docs/en and docs/vi, relative to the CWD."""
    print("Starting bilingual documentation separation...")

    # Process all English files (sorted for a deterministic processing order)
    print("Processing English documentation files...")
    for filepath in sorted(glob.glob('docs/en/skills/*.md')):
        if os.path.isfile(filepath):
            process_english_file(filepath)

    # Process all Vietnamese files
    print("Processing Vietnamese documentation files...")
    for filepath in sorted(glob.glob('docs/vi/skills/*.md')):
        if os.path.isfile(filepath):
            process_vietnamese_file(filepath)

    print("Bilingual documentation separation completed!")


if __name__ == '__main__':
    main()