Files
pos-system/scripts/fix-bilingual-docs.py
Ho Ngoc Hai 9b6c585f57 Enhance documentation structure and improve bilingual support across skills
- Updated skill documentation files to include structured metadata for better organization.
- Enhanced bilingual descriptions and guidelines for clarity in both English and Vietnamese.
- Refined sections on usage, best practices, and related skills to ensure consistency across all documentation.
- Improved formatting and removed outdated references to streamline the documentation experience.
- Added best practices checklists to relevant skills for better usability and adherence to standards.
2026-01-01 07:35:44 +07:00

154 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Script to separate bilingual documentation into language-specific files.
Removes Vietnamese content from English docs and English content from Vietnamese docs.
"""
import os
import re
import glob
def process_english_file(filepath):
    """Rewrite the Markdown file at *filepath* in place, keeping only English.

    The file is read as UTF-8, every bilingual construct is reduced to its
    English half, and the result is written back to the same path.

    Args:
        filepath: Path of the Markdown file to rewrite.
    """
    print(f"Processing English file: {filepath}")
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    content = _keep_english(content)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)


def _keep_english(content):
    """Return *content* with the Vietnamese half of each bilingual construct removed."""
    # Block-quote description pair: keep the English line, drop the marker.
    content = re.sub(
        r'^> \*\*EN\*\*: (.+)\n> \*\*VI\*\*: .+', r'> \1', content, flags=re.MULTILINE
    )
    # Vietnamese description lines that had no English partner directly above.
    content = re.sub(r'^> \*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Headings are written "English / Vietnamese" (same order as the labels and
    # code comments handled below), so the English title is the part BEFORE
    # the first " / ".
    content = re.sub(
        r'^(#{1,6}) (.+?) / .+$', r'\1 \2', content, flags=re.MULTILINE
    )

    # Inline pair "**EN**: text" immediately followed by "**VI**: text".
    content = re.sub(r'\*\*EN\*\*: ([^\n]+)\n\*\*VI\*\*: [^\n]+', r'\1', content)

    # Multi-line "**VI**:" block: remove the marker and every following line up
    # to the next language marker or heading. re.MULTILINE is required so that
    # the "^#" alternative matches at the start of each line; without it the
    # heading stop never triggers and later sections are deleted as well.
    content = re.sub(
        r'\*\*VI\*\*:\n((?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*)',
        '',
        content,
        flags=re.MULTILINE,
    )
    # Single-line Vietnamese entries that are still left.
    content = re.sub(r'^\*\*VI\*\*: .+\n', '', content, flags=re.MULTILINE)
    # The English text stays; only its marker is removed.
    content = re.sub(r'\*\*EN\*\*: ', '', content)

    # Known bilingual bold labels -> English label.
    label_map = [
        ('**Reference / Tham Khảo**:', '**Reference**:'),
        ('**Patterns / Các Patterns**:', '**Patterns**:'),
        ('**Key Generators / Bộ Tạo Key**:', '**Key Generators**:'),
        ('**Speed / Tốc Độ**:', '**Speed**:'),
        ('**Capacity / Dung Lượng**:', '**Capacity**:'),
        ('**Scope / Phạm Vi**:', '**Scope**:'),
        ('**Use Case / Trường Hợp Sử Dụng**:', '**Use Case**:'),
        ('**L1 Cache (Memory) / Cache Bộ Nhớ**:', '**L1 Cache (Memory)**:'),
        ('**L2 Cache (Redis) / Cache Redis**:', '**L2 Cache (Redis)**:'),
    ]
    for bilingual, english in label_map:
        content = content.replace(bilingual, english)

    # "// English / Vietnamese" comment lines: keep everything before the
    # first " / ", which includes the original indentation and the "//".
    kept_lines = []
    for line in content.split('\n'):
        if line.strip().startswith('//') and ' / ' in line:
            line = line.split(' / ', 1)[0]
        kept_lines.append(line)
    return '\n'.join(kept_lines)
def process_vietnamese_file(filepath):
    """Rewrite the Markdown file at *filepath* in place, keeping only Vietnamese.

    The file is read as UTF-8, every bilingual construct is reduced to its
    Vietnamese half, and the result is written back to the same path.

    Args:
        filepath: Path of the Markdown file to rewrite.
    """
    print(f"Processing Vietnamese file: {filepath}")
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    content = _keep_vietnamese(content)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(content)


def _keep_vietnamese(content):
    """Return *content* with the English half of each bilingual construct removed."""
    # Block-quote description pair: keep the Vietnamese line, drop the marker.
    content = re.sub(
        r'^> \*\*EN\*\*: .+\n> \*\*VI\*\*: (.+)', r'> \1', content, flags=re.MULTILINE
    )
    # English description lines that had no Vietnamese partner directly below.
    content = re.sub(r'^> \*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)

    # Headings are written "English / Vietnamese"; keep the part after " / ".
    content = re.sub(
        r'^(#{1,6}) .+ / (.+)$', r'\1 \2', content, flags=re.MULTILINE
    )

    # Inline pair "**EN**: text" immediately followed by "**VI**: text".
    content = re.sub(r'\*\*EN\*\*: [^\n]+\n\*\*VI\*\*: ([^\n]+)', r'\1', content)

    # Multi-line "**EN**:" block: remove the marker and every following line up
    # to the next language marker or heading. re.MULTILINE is required so that
    # the "^#" alternative matches at the start of each line; without it the
    # heading stop never triggers and later sections are deleted as well.
    content = re.sub(
        r'\*\*EN\*\*:\n((?:(?!\*\*EN\*\*:|\*\*VI\*\*:|^#{1,6}).*\n)*)',
        '',
        content,
        flags=re.MULTILINE,
    )
    # Single-line English entries that are still left.
    content = re.sub(r'^\*\*EN\*\*: .+\n', '', content, flags=re.MULTILINE)
    # The Vietnamese text stays; only its marker is removed.
    content = re.sub(r'\*\*VI\*\*: ', '', content)

    # Known bilingual bold labels -> Vietnamese label.
    label_map = [
        ('**Reference / Tham Khảo**:', '**Tham Khảo**:'),
        ('**Patterns / Các Patterns**:', '**Các Patterns**:'),
        ('**Key Generators / Bộ Tạo Key**:', '**Bộ Tạo Key**:'),
        ('**Speed / Tốc Độ**:', '**Tốc Độ**:'),
        ('**Capacity / Dung Lượng**:', '**Dung Lượng**:'),
        ('**Scope / Phạm Vi**:', '**Phạm Vi**:'),
        ('**Use Case / Trường Hợp Sử Dụng**:', '**Trường Hợp Sử Dụng**:'),
        ('**L1 Cache (Memory) / Cache Bộ Nhớ**:', '**Cache Bộ Nhớ**:'),
        ('**L2 Cache (Redis) / Cache Redis**:', '**Cache Redis**:'),
    ]
    for bilingual, vietnamese in label_map:
        content = content.replace(bilingual, vietnamese)

    # "// English / Vietnamese" comment lines: rebuild as "// Vietnamese" while
    # keeping the line's leading whitespace, so comments inside indented code
    # samples stay aligned with the code around them.
    kept_lines = []
    for line in content.split('\n'):
        if line.strip().startswith('//') and ' / ' in line:
            indent = line[:len(line) - len(line.lstrip())]
            vietnamese_part = line.split(' / ', 1)[1]
            line = f"{indent}// {vietnamese_part}"
        kept_lines.append(line)
    return '\n'.join(kept_lines)
def main():
    """Run the English pass, then the Vietnamese pass, over the skills docs.

    Both glob patterns are relative, so they resolve against the current
    working directory. Matches that are not regular files are skipped.
    """
    print("Starting bilingual documentation separation...")

    # English pass
    print("Processing English documentation files...")
    for md_path in filter(os.path.isfile, glob.glob('docs/en/skills/*.md')):
        process_english_file(md_path)

    # Vietnamese pass
    print("Processing Vietnamese documentation files...")
    for md_path in filter(os.path.isfile, glob.glob('docs/vi/skills/*.md')):
        process_vietnamese_file(md_path)

    print("Bilingual documentation separation completed!")
# Run the separation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()