feat(ai-services): add Vietnamese NLP pipeline for property description analysis

Implement auto-tagging (amenities, location features, condition/legal), content quality scoring with moderation integration, and FastAPI endpoints for single and batch text analysis. Uses underthesea for Vietnamese tokenization, sentence splitting, and POS tagging when available; without it, tokenization falls back to whitespace splitting, sentence splitting falls back to a regex, and noun-phrase extraction is skipped.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-08 22:42:31 +07:00
parent 944d6262e7
commit ee3ae2e81d
5 changed files with 431 additions and 1 deletions
--- a/libs/ai-services/app/main.py
+++ b/libs/ai-services/app/main.py
@@ -6,7 +6,7 @@ from slowapi.util import get_remote_address

 from app.config import settings
 from app.middleware import verify_api_key
-from app.routers import avm, moderation
+from app.routers import avm, moderation, nlp

 limiter = Limiter(key_func=get_remote_address, default_limits=[settings.rate_limit])

@@ -33,6 +33,7 @@ app.add_middleware(

 app.include_router(avm.router)
 app.include_router(moderation.router)
+app.include_router(nlp.router)


@app.get("/health")
--- a/libs/ai-services/app/models/nlp.py
+++ b/libs/ai-services/app/models/nlp.py
@@ -0,0 +1,48 @@
+from typing import Annotated
+
+from pydantic import BaseModel, Field
+
+
+# Request body for single-text analysis (POST /nlp/analyze).
+class NLPAnalyzeRequest(BaseModel):
+    # Raw description to analyze; must contain at least one character.
+    text: str = Field(..., min_length=1, description="Vietnamese property description text")
+    # When true (the default), the service also runs the moderation check and
+    # scales the overall quality score by the resulting risk.
+    include_moderation: bool = Field(
+        True, description="Whether to include moderation quality score"
+    )
+
+
+# One dictionary hit produced by the auto-tagger.
+class PropertyTag(BaseModel):
+    # NOTE(review): the NLP service in this change only emits "amenity",
+    # "location" and "condition"; legal terms are filed under "condition",
+    # so "legal" is never produced — confirm whether that is intended.
+    category: str = Field(..., description="Tag category: amenity, location, condition, legal")
+    # The dictionary key (canonical tag name), not the keyword variant that matched.
+    tag: str = Field(..., description="Normalized tag name")
+    # NOTE(review): the service slices this from the lowercased text, so the
+    # author's original casing is not preserved.
+    matched_text: str = Field(..., description="Original text that matched")
+    # Must lie in [0, 1]; the service assigns 0.9 for keywords longer than
+    # three characters and 0.75 otherwise.
+    confidence: float = Field(..., ge=0, le=1, description="Match confidence")
+
+
+# Quality metrics for one analyzed description; every score is validated to [0, 1].
+class QualityScore(BaseModel):
+    # Weighted blend of the three sub-scores below, multiplied by
+    # (1 - moderation_score) when a moderation score is present.
+    overall: float = Field(..., ge=0, le=1, description="Overall content quality 0-1")
+    # Fraction of the expected listing facts (area, bedrooms, price, ...) found in the text.
+    completeness: float = Field(
+        ..., ge=0, le=1, description="How complete the listing info is"
+    )
+    # Heuristic based on average sentence length and the share of uppercase characters.
+    readability: float = Field(..., ge=0, le=1, description="Text readability score")
+    # Grows with the number of matched tags per word, capped at 1.
+    information_density: float = Field(
+        ..., ge=0, le=1, description="Ratio of useful info to total text"
+    )
+    # None when the request opted out of moderation.
+    moderation_score: float | None = Field(
+        None, ge=0, le=1, description="Moderation risk score (0=safe, 1=risky)"
+    )
+
+
+# Full analysis result for a single description.
+class NLPAnalyzeResponse(BaseModel):
+    # Amenity, location and condition tags, concatenated in that order.
+    tags: list[PropertyTag] = Field(default_factory=list)
+    quality: QualityScore
+    tokens: list[str] = Field(default_factory=list, description="Word-segmented tokens")
+    sentences: list[str] = Field(default_factory=list, description="Sentence-split results")
+    # Empty when the POS tagger (underthesea) cannot be imported; at most 20 entries.
+    keyword_phrases: list[str] = Field(
+        default_factory=list, description="Key noun phrases extracted"
+    )
+
+
+# Request body for batch analysis (POST /nlp/batch-analyze).
+class BatchAnalyzeRequest(BaseModel):
+    # Between 1 and 50 texts per call. Each item carries the same min_length=1
+    # rule as NLPAnalyzeRequest.text, so an empty string is rejected while the
+    # request body is validated instead of raising a ValidationError later,
+    # when the router builds a per-item NLPAnalyzeRequest inside the handler.
+    texts: list[Annotated[str, Field(min_length=1)]] = Field(
+        ..., min_length=1, max_length=50, description="Batch of texts"
+    )
+    # Applied uniformly to every text in the batch.
+    include_moderation: bool = Field(True)
+
+
+# Response body for batch analysis: one result per input text, in input order.
+class BatchAnalyzeResponse(BaseModel):
+    results: list[NLPAnalyzeResponse]
--- a/libs/ai-services/app/routers/nlp.py
+++ b/libs/ai-services/app/routers/nlp.py
@@ -0,0 +1,27 @@
+from fastapi import APIRouter
+
+from app.models.nlp import (
+    BatchAnalyzeRequest,
+    BatchAnalyzeResponse,
+    NLPAnalyzeRequest,
+    NLPAnalyzeResponse,
+)
+from app.services.nlp_service import nlp_service
+
+router = APIRouter(prefix="/nlp", tags=["NLP"])
+
+
+@router.post("/analyze", response_model=NLPAnalyzeResponse)
+def analyze(req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
+    """Run the NLP pipeline on one Vietnamese property description.
+
+    Delegates to the shared ``nlp_service`` and returns its result (tags,
+    quality scores, tokens, sentences and keyword phrases) unchanged.
+    """
+    analysis = nlp_service.analyze(req)
+    return analysis
+
+
+@router.post("/batch-analyze", response_model=BatchAnalyzeResponse)
+def batch_analyze(req: BatchAnalyzeRequest) -> BatchAnalyzeResponse:
+    """Analyze every text in the batch and return the results in input order.
+
+    Each text is wrapped in its own ``NLPAnalyzeRequest`` that inherits the
+    batch-level ``include_moderation`` flag, then analyzed one after another.
+    """
+    analyzed: list[NLPAnalyzeResponse] = []
+    for description in req.texts:
+        single = NLPAnalyzeRequest(
+            text=description,
+            include_moderation=req.include_moderation,
+        )
+        analyzed.append(nlp_service.analyze(single))
+    return BatchAnalyzeResponse(results=analyzed)
--- a/libs/ai-services/app/services/nlp_service.py
+++ b/libs/ai-services/app/services/nlp_service.py
@@ -0,0 +1,235 @@
+import logging
+import re
+
+from app.models.nlp import (
+    NLPAnalyzeRequest,
+    NLPAnalyzeResponse,
+    PropertyTag,
+    QualityScore,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Tag dictionaries — Vietnamese real-estate domain
+# ---------------------------------------------------------------------------
+
+# Canonical amenity tag -> keyword variants that trigger it. The key becomes
+# PropertyTag.tag. Keywords are searched in the lowercased description, so they
+# must be written in lowercase; they are tried in list order and the first hit
+# decides matched_text and confidence.
+AMENITY_TAGS: dict[str, list[str]] = {
+    "hồ bơi": ["hồ bơi", "bể bơi", "pool"],
+    "phòng gym": ["phòng gym", "gym", "phòng tập"],
+    "sân vườn": ["sân vườn", "vườn", "garden"],
+    "gara ô tô": ["gara", "garage", "nhà xe", "chỗ đậu xe"],
+    "thang máy": ["thang máy", "elevator"],
+    "ban công": ["ban công", "balcony", "lô gia", "logia"],
+    "sân thượng": ["sân thượng", "rooftop"],
+    "bảo vệ 24/7": ["bảo vệ 24", "an ninh 24", "security 24"],
+    "khu vui chơi": ["khu vui chơi", "playground", "sân chơi trẻ em"],
+    "hầm để xe": ["hầm để xe", "hầm xe", "tầng hầm"],
+    "điều hòa": ["điều hòa", "máy lạnh"],
+    "nội thất cao cấp": ["nội thất cao cấp", "full nội thất", "nội thất đầy đủ"],
+    "camera an ninh": ["camera", "camera an ninh"],
+    "sân tennis": ["sân tennis", "tennis"],
+    "công viên nội khu": ["công viên nội khu", "công viên", "park"],
+}
+
+# Canonical location tag -> keyword variants, matched the same way as the
+# amenity table (lowercase keywords, first hit in list order wins).
+# NOTE(review): a few variants are bare nouns without a proximity word
+# ("siêu thị", "trung tâm"), so any mention triggers the tag — confirm intended.
+LOCATION_TAGS: dict[str, list[str]] = {
+    "gần trường học": ["gần trường", "cạnh trường", "kế trường"],
+    "gần bệnh viện": ["gần bệnh viện", "cạnh bệnh viện", "kế bệnh viện"],
+    "gần chợ": ["gần chợ", "cạnh chợ", "kế chợ"],
+    "gần siêu thị": ["gần siêu thị", "cạnh siêu thị", "siêu thị"],
+    "mặt tiền đường": ["mặt tiền", "mặt đường", "mặt phố"],
+    "gần metro": ["gần metro", "gần tàu điện", "cạnh metro"],
+    "gần sân bay": ["gần sân bay", "cạnh sân bay"],
+    "trung tâm thành phố": ["trung tâm", "trung tâm thành phố", "trung tâm tp"],
+    "ven sông": ["ven sông", "view sông", "mặt sông"],
+    "gần biển": ["gần biển", "view biển", "mặt biển", "sát biển"],
+    "gần công viên": ["gần công viên", "cạnh công viên"],
+    "khu dân cư": ["khu dân cư", "kdc"],
+}
+
+# Canonical condition tag -> keyword variants. Legal-status entries (title
+# papers, ownership, paperwork) live in this table as well, so the service
+# reports them under the "condition" category.
+# NOTE(review): "full nội thất" / "nội thất đầy đủ" also appear in the amenity
+# table, so one phrase yields both an amenity and a condition tag — confirm intended.
+CONDITION_TAGS: dict[str, list[str]] = {
+    "mới xây": ["mới xây", "xây mới", "mới hoàn thiện", "vừa xây xong"],
+    "đã qua sử dụng": ["đã qua sử dụng", "đã sử dụng"],
+    "cần sửa chữa": ["cần sửa", "cần cải tạo", "cần nâng cấp", "sửa chữa"],
+    "đang xây dựng": ["đang xây", "đang thi công", "sắp bàn giao"],
+    "sổ đỏ": ["sổ đỏ"],
+    "sổ hồng": ["sổ hồng"],
+    "chính chủ": ["chính chủ"],
+    "pháp lý rõ ràng": ["pháp lý rõ", "pháp lý đầy đủ", "pháp lý sạch"],
+    "hoàn thiện cơ bản": ["hoàn thiện cơ bản", "bàn giao thô"],
+    "đầy đủ nội thất": ["đầy đủ nội thất", "full nội thất", "nội thất đầy đủ"],
+}
+
+# Regexes for the facts a complete listing is expected to state. The
+# completeness score is the fraction of these patterns found in the text
+# (searched case-insensitively), so every entry carries equal weight.
+_COMPLETENESS_FIELDS = [
+    r"\d+(?:[.,]\d+)?\s*(?:m2|m²|mét vuông)",  # area
+    r"\d+\s*(?:phòng ngủ|pn|PN)",  # bedrooms
+    r"\d+\s*(?:tầng|lầu)",  # floors
+    r"(?:sổ đỏ|sổ hồng|chính chủ|pháp lý)",  # legal
+    r"\d+(?:[.,]\d+)?\s*(?:tỷ|tỉ|triệu)",  # price
+    r"(?:quận|huyện|phường|xã|đường|tp\.|thành phố)",  # location
+    r"(?:căn hộ|chung cư|nhà phố|biệt thự|đất|shophouse|nhà riêng)",  # property type
+]
+
+
+class NLPService:
+    """Vietnamese NLP pipeline for property description analysis.
+
+    Combines dictionary-based auto-tagging, heuristic quality scoring, optional
+    moderation scoring and tokenization. ``underthesea`` is used whenever it can
+    be imported; otherwise simpler fallbacks are applied.
+    """
+
+    # Flipped after the first fallback warning so a missing optional dependency
+    # is reported once instead of on every request.
+    _warned_missing_underthesea = False
+
+    def _match_tags(
+        self,
+        text_lower: str,
+        tag_dict: dict[str, list[str]],
+        category: str,
+    ) -> list[PropertyTag]:
+        """Return at most one tag per dictionary entry found in the text.
+
+        Args:
+            text_lower: Description text, already lowercased by the caller.
+            tag_dict: Canonical tag name -> keyword variants (lowercase).
+            category: Category label copied onto every produced tag.
+
+        Returns:
+            Tags in dictionary order. For each entry the first keyword (in list
+            order) that occurs as a whole word/phrase decides ``matched_text``
+            and ``confidence``.
+        """
+        tags: list[PropertyTag] = []
+        for tag_name, keywords in tag_dict.items():
+            for kw in keywords:
+                # Match whole words only: a plain substring search tags
+                # "parking" as "park" or "carpool" as "pool". The trailing
+                # boundary is applied only to keywords ending in a letter, so
+                # prefix-style keywords such as "bảo vệ 24" still match "24h".
+                tail = r"(?!\w)" if kw[-1:].isalpha() else ""
+                hit = re.search(rf"(?<!\w){re.escape(kw)}{tail}", text_lower)
+                if hit is None:
+                    continue
+                tags.append(
+                    PropertyTag(
+                        category=category,
+                        tag=tag_name,
+                        # Taken from the lowercased text; the original casing
+                        # is not available to this method.
+                        matched_text=hit.group(0),
+                        # Short keywords are more ambiguous, hence the lower score.
+                        confidence=0.9 if len(kw) > 3 else 0.75,
+                    )
+                )
+                break  # one tag per entry; remaining variants are irrelevant
+        return tags
+
+    def _compute_completeness(self, text: str) -> float:
+        """Return the fraction of expected listing facts present in ``text``.
+
+        Each pattern in ``_COMPLETENESS_FIELDS`` counts once, searched
+        case-insensitively. The result is rounded to three decimals.
+        """
+        matched = sum(
+            1 for pattern in _COMPLETENESS_FIELDS if re.search(pattern, text, re.IGNORECASE)
+        )
+        return round(matched / len(_COMPLETENESS_FIELDS), 3)
+
+    def _compute_readability(self, text: str) -> float:
+        """Score readability from average sentence length and capitalization.
+
+        Sentences are split on ``.``, ``!``, ``?``, ``;`` and newlines. Text
+        with no non-empty sentence scores 0.0. The result is rounded to three
+        decimals.
+        """
+        sentences = [s.strip() for s in re.split(r"[.!?;\n]+", text) if s.strip()]
+        if not sentences:
+            return 0.0
+
+        avg_sentence_len = sum(len(s.split()) for s in sentences) / len(sentences)
+
+        # Fixed scores for very short or very long sentences.
+        # NOTE(review): an average just above 40 words scores 0.5, which is
+        # higher than the 0.3 floor reached at about 35-40 words — confirm
+        # that this step is intended.
+        if avg_sentence_len < 3:
+            score = 0.4
+        elif avg_sentence_len > 40:
+            score = 0.5
+        else:
+            # Peak at 14 words per sentence, falling off linearly, floor 0.3.
+            score = 1.0 - abs(avg_sentence_len - 14) / 30
+            score = max(0.3, min(1.0, score))
+
+        # Penalize shouting: more than 30% of all characters are uppercase.
+        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
+        if caps_ratio > 0.3:
+            score *= 0.7
+
+        return round(score, 3)
+
+    def _compute_info_density(self, text: str, tag_count: int) -> float:
+        """Return tags-per-word density, scaled by 5 and capped at 1.0.
+
+        Text without any whitespace-separated word scores 0.0. The result is
+        rounded to three decimals.
+        """
+        word_count = len(text.split())
+        if word_count == 0:
+            return 0.0
+        # One tag per five words already counts as full density.
+        density = min(1.0, (tag_count * 5) / word_count)
+        return round(density, 3)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Word-segment ``text`` with underthesea, or split on whitespace.
+
+        The fallback is taken when importing or calling underthesea raises
+        ``ImportError``; the warning about it is logged only once per process.
+        """
+        try:
+            from underthesea import word_tokenize
+
+            return word_tokenize(text)
+        except ImportError:
+            if not NLPService._warned_missing_underthesea:
+                NLPService._warned_missing_underthesea = True
+                logger.warning("underthesea not available — falling back to whitespace split")
+            return text.split()
+
+    def _sentence_split(self, text: str) -> list[str]:
+        """Split ``text`` into sentences with underthesea, or with a regex.
+
+        The regex fallback splits on ``.``, ``!``, ``?`` and newlines and drops
+        empty fragments.
+        """
+        try:
+            from underthesea import sent_tokenize
+
+            return sent_tokenize(text)
+        except ImportError:
+            return [s.strip() for s in re.split(r"[.!?\n]+", text) if s.strip()]
+
+    def _extract_noun_phrases(self, tokens: list[str], text: str) -> list[str]:
+        """Extract key noun phrases using POS tagging when available.
+
+        Consecutive words tagged "N", "Np", "Nc", "Nu" or "A" (presumably the
+        noun variants and adjectives of underthesea's tag set — TODO confirm)
+        are joined into a phrase; runs shorter than two words are discarded.
+
+        Args:
+            tokens: Currently unused; tagging is performed on ``text`` directly.
+            text: Raw description text.
+
+        Returns:
+            Up to 20 phrases in order of appearance, or an empty list when
+            underthesea cannot be imported.
+        """
+        try:
+            from underthesea import pos_tag
+
+            tagged = pos_tag(text)
+            phrases: list[str] = []
+            current_phrase: list[str] = []
+
+            for word, pos in tagged:
+                if pos in ("N", "Np", "Nc", "Nu", "A"):
+                    current_phrase.append(word)
+                else:
+                    if len(current_phrase) >= 2:
+                        phrases.append(" ".join(current_phrase))
+                    current_phrase = []
+
+            # Flush a phrase that runs to the end of the text.
+            if len(current_phrase) >= 2:
+                phrases.append(" ".join(current_phrase))
+
+            return phrases[:20]  # Cap at 20 phrases
+        except ImportError:
+            return []
+
+    def analyze(self, req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
+        """Run the full pipeline on one description.
+
+        Steps: normalize the text, auto-tag it against the three dictionaries,
+        tokenize, compute the quality sub-scores, optionally fetch a moderation
+        score, and fold everything into the overall quality.
+
+        Args:
+            req: Text to analyze and the moderation opt-in flag.
+
+        Returns:
+            Tags, quality scores, tokens, sentences and keyword phrases.
+        """
+        import unicodedata
+
+        # Vietnamese text can arrive with decomposed diacritics (NFD). The
+        # keyword tables and regexes in this module are assumed to be written
+        # with precomposed characters (TODO confirm file encoding), so compose
+        # the input first; otherwise visually identical text would not match.
+        text = unicodedata.normalize("NFC", req.text)
+        text_lower = text.lower()
+
+        # Auto-tag
+        amenity_tags = self._match_tags(text_lower, AMENITY_TAGS, "amenity")
+        location_tags = self._match_tags(text_lower, LOCATION_TAGS, "location")
+        condition_tags = self._match_tags(text_lower, CONDITION_TAGS, "condition")
+        all_tags = amenity_tags + location_tags + condition_tags
+
+        # Tokenization
+        tokens = self._tokenize(text)
+        sentences = self._sentence_split(text)
+        keyword_phrases = self._extract_noun_phrases(tokens, text)
+
+        # Quality scoring
+        completeness = self._compute_completeness(text)
+        readability = self._compute_readability(text)
+        info_density = self._compute_info_density(text, len(all_tags))
+
+        # Moderation integration (imported lazily, only when requested)
+        moderation_score: float | None = None
+        if req.include_moderation:
+            from app.services.moderation_service import moderation_service
+            from app.models.moderation import ModerationRequest
+
+            mod_result = moderation_service.check(ModerationRequest(text=text, context="listing"))
+            moderation_score = mod_result.score
+
+        # Overall quality: weighted combination, scaled down by moderation
+        # risk; without a moderation score no penalty is applied.
+        mod_penalty = (1 - moderation_score) if moderation_score is not None else 1.0
+        overall = round(
+            (completeness * 0.4 + readability * 0.3 + info_density * 0.3) * mod_penalty,
+            3,
+        )
+
+        quality = QualityScore(
+            overall=overall,
+            completeness=completeness,
+            readability=readability,
+            information_density=info_density,
+            moderation_score=moderation_score,
+        )
+
+        return NLPAnalyzeResponse(
+            tags=all_tags,
+            quality=quality,
+            tokens=tokens,
+            sentences=sentences,
+            keyword_phrases=keyword_phrases,
+        )
+
+
+# Shared module-level instance; app.routers.nlp imports and calls this object.
+nlp_service = NLPService()
--- a/libs/ai-services/tests/test_nlp.py
+++ b/libs/ai-services/tests/test_nlp.py
@@ -0,0 +1,119 @@
+from fastapi.testclient import TestClient
+
+from app.main import app
+
+# In-process client bound to the FastAPI app; every test below posts through it.
+client = TestClient(app)
+
+# Reference listing reused by several tests. It states area, bedrooms, floors,
+# district, price and legal status, and names several amenities and nearby places.
+SAMPLE_LISTING = (
+    "Bán căn hộ chung cư cao cấp 85m² tại quận 7, 2 phòng ngủ, 2 WC, "
+    "3 tầng, nội thất đầy đủ. Có hồ bơi, phòng gym, bảo vệ 24/7. "
+    "Gần trường học và siêu thị. Sổ hồng chính chủ. Giá 3.5 tỷ."
+)
+
+
+def test_analyze_returns_tags():
+    """The sample listing yields the expected amenity, location and legal tags."""
+    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert response.status_code == 200
+
+    found = {entry["tag"] for entry in response.json()["tags"]}
+    expected = {
+        # Amenities
+        "hồ bơi",
+        "phòng gym",
+        "bảo vệ 24/7",
+        # Location
+        "gần trường học",
+        "gần siêu thị",
+        # Condition / legal
+        "sổ hồng",
+        "chính chủ",
+    }
+    assert expected <= found
+
+
+def test_analyze_quality_scores():
+    """Every quality sub-score lies in (0, 1] and the clean sample carries no moderation risk."""
+    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert response.status_code == 200
+
+    scores = response.json()["quality"]
+    for metric in ("overall", "completeness", "readability", "information_density"):
+        assert 0 < scores[metric] <= 1
+
+    moderation = scores["moderation_score"]
+    assert moderation is not None
+    assert moderation == 0.0  # clean listing
+
+
+def test_analyze_completeness_low_for_sparse_text():
+    """A listing that states no concrete facts scores low on completeness."""
+    sparse_payload = {"text": "Bán nhà đẹp giá tốt"}
+    response = client.post("/nlp/analyze", json=sparse_payload)
+    assert response.status_code == 200
+    completeness = response.json()["quality"]["completeness"]
+    assert completeness < 0.3
+
+
+def test_analyze_tokens_present():
+    """Tokenization and sentence splitting both return non-empty lists."""
+    response = client.post("/nlp/analyze", json={"text": SAMPLE_LISTING})
+    assert response.status_code == 200
+    body = response.json()
+    for key in ("tokens", "sentences"):
+        assert len(body[key]) > 0
+
+
+def test_analyze_no_moderation():
+    """Opting out of moderation leaves the moderation score unset."""
+    payload = {"text": SAMPLE_LISTING, "include_moderation": False}
+    response = client.post("/nlp/analyze", json=payload)
+    assert response.status_code == 200
+    moderation = response.json()["quality"]["moderation_score"]
+    assert moderation is None
+
+
+def test_analyze_flagged_content_reduces_quality():
+    """A flagged listing gets a moderation score and a lower overall quality.
+
+    The same text is analyzed with and without moderation so the reduction
+    promised by the test name is actually checked, not just the flag itself.
+    """
+    flagged_text = (
+        "Bán căn hộ 80m² 2 phòng ngủ quận 1. Liên hệ 0912345678. "
+        "Sổ hồng chính chủ. Giá 5 tỷ."
+    )
+    resp = client.post("/nlp/analyze", json={"text": flagged_text})
+    assert resp.status_code == 200
+    quality = resp.json()["quality"]
+    assert quality["moderation_score"] > 0  # phone number flagged
+
+    # Baseline: identical text with the moderation penalty switched off.
+    baseline_resp = client.post(
+        "/nlp/analyze",
+        json={"text": flagged_text, "include_moderation": False},
+    )
+    assert baseline_resp.status_code == 200
+    baseline = baseline_resp.json()["quality"]
+    assert quality["overall"] < baseline["overall"]
+
+
+def test_batch_analyze():
+    """The batch endpoint returns one result per text, in input order."""
+    payload = {
+        "texts": [
+            "Bán căn hộ 60m² có hồ bơi gần trường học. Sổ đỏ. 2 tỷ.",
+            "Bán đất nền 200m² mặt tiền đường lớn. Pháp lý rõ ràng.",
+        ],
+        "include_moderation": True,
+    }
+    response = client.post("/nlp/batch-analyze", json=payload)
+    assert response.status_code == 200
+
+    results = response.json()["results"]
+    assert len(results) == 2
+    first_tags = [entry["tag"] for entry in results[0]["tags"]]
+    second_tags = [entry["tag"] for entry in results[1]["tags"]]
+    assert "hồ bơi" in first_tags
+    assert "mặt tiền đường" in second_tags
+
+
+def test_analyze_location_tags():
+    """River, metro and city-centre mentions are all tagged as locations."""
+    description = "Căn hộ ven sông Sài Gòn, gần metro số 1, trung tâm thành phố."
+    response = client.post("/nlp/analyze", json={"text": description})
+    assert response.status_code == 200
+    found = {entry["tag"] for entry in response.json()["tags"]}
+    for expected in ("ven sông", "gần metro", "trung tâm thành phố"):
+        assert expected in found
+
+
+def test_analyze_condition_tags():
+    """Condition keywords in the text map to their canonical condition tags."""
+    text = "Nhà mới xây, hoàn thiện cơ bản, 3 tầng, đang thi công sắp bàn giao."
+    resp = client.post("/nlp/analyze", json={"text": text})
+    assert resp.status_code == 200
+    tag_names = [t["tag"] for t in resp.json()["tags"]]
+    assert "mới xây" in tag_names
+    assert "hoàn thiện cơ bản" in tag_names
+    # "đang thi công" is a keyword of the under-construction tag; the sample
+    # text includes it, so the tag must be reported too.
+    assert "đang xây dựng" in tag_names