goodgo-platform/libs/ai-services/app/services/nlp_service.py

import logging
import re

from app.models.nlp import (
    NLPAnalyzeRequest,
    NLPAnalyzeResponse,
    PropertyTag,
    QualityScore,
)

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Tag dictionaries — Vietnamese real-estate domain
# ---------------------------------------------------------------------------

# Amenity tags: canonical tag name -> trigger keywords.
# Keywords are matched as plain substrings of the lower-cased description
# (see NLPService._match_tags), so they must be written in lower case. Within
# one entry the first keyword in list order that occurs in the text is the one
# reported as the match. Because matching is by substring, short keywords such
# as "park" or "gym" also hit inside longer words.
AMENITY_TAGS: dict[str, list[str]] = {
    "hồ bơi": ["hồ bơi", "bể bơi", "pool"],
    "phòng gym": ["phòng gym", "gym", "phòng tập"],
    "sân vườn": ["sân vườn", "vườn", "garden"],
    "gara ô tô": ["gara", "garage", "nhà xe", "chỗ đậu xe"],
    "thang máy": ["thang máy", "elevator"],
    "ban công": ["ban công", "balcony", "lô gia", "logia"],
    "sân thượng": ["sân thượng", "rooftop"],
    "bảo vệ 24/7": ["bảo vệ 24", "an ninh 24", "security 24"],
    "khu vui chơi": ["khu vui chơi", "playground", "sân chơi trẻ em"],
    "hầm để xe": ["hầm để xe", "hầm xe", "tầng hầm"],
    "điều hòa": ["điều hòa", "máy lạnh"],
    "nội thất cao cấp": ["nội thất cao cấp", "full nội thất", "nội thất đầy đủ"],
    "camera an ninh": ["camera", "camera an ninh"],
    "sân tennis": ["sân tennis", "tennis"],
    "công viên nội khu": ["công viên nội khu", "công viên", "park"],
}

# Location tags: canonical tag name -> trigger keywords (lower case, matched as
# substrings of the lower-cased description by NLPService._match_tags).
# Some entries include a bare noun (e.g. "siêu thị", "trung tâm"), so the tag
# fires on any mention of that noun, not only on "near ..." phrasing.
LOCATION_TAGS: dict[str, list[str]] = {
    "gần trường học": ["gần trường", "cạnh trường", "kế trường"],
    "gần bệnh viện": ["gần bệnh viện", "cạnh bệnh viện", "kế bệnh viện"],
    "gần chợ": ["gần chợ", "cạnh chợ", "kế chợ"],
    "gần siêu thị": ["gần siêu thị", "cạnh siêu thị", "siêu thị"],
    "mặt tiền đường": ["mặt tiền", "mặt đường", "mặt phố"],
    "gần metro": ["gần metro", "gần tàu điện", "cạnh metro"],
    "gần sân bay": ["gần sân bay", "cạnh sân bay"],
    "trung tâm thành phố": ["trung tâm", "trung tâm thành phố", "trung tâm tp"],
    "ven sông": ["ven sông", "view sông", "mặt sông"],
    "gần biển": ["gần biển", "view biển", "mặt biển", "sát biển"],
    "gần công viên": ["gần công viên", "cạnh công viên"],
    "khu dân cư": ["khu dân cư", "kdc"],
}

# Condition / legal-status tags: canonical tag name -> trigger keywords (lower
# case, matched as substrings of the lower-cased description by
# NLPService._match_tags).
# The keywords "full nội thất" and "nội thất đầy đủ" are also listed under
# AMENITY_TAGS["nội thất cao cấp"], so a description containing them yields
# both an amenity tag and a condition tag.
CONDITION_TAGS: dict[str, list[str]] = {
    "mới xây": ["mới xây", "xây mới", "mới hoàn thiện", "vừa xây xong"],
    "đã qua sử dụng": ["đã qua sử dụng", "đã sử dụng"],
    "cần sửa chữa": ["cần sửa", "cần cải tạo", "cần nâng cấp", "sửa chữa"],
    "đang xây dựng": ["đang xây", "đang thi công", "sắp bàn giao"],
    "sổ đỏ": ["sổ đỏ"],
    "sổ hồng": ["sổ hồng"],
    "chính chủ": ["chính chủ"],
    "pháp lý rõ ràng": ["pháp lý rõ", "pháp lý đầy đủ", "pháp lý sạch"],
    "hoàn thiện cơ bản": ["hoàn thiện cơ bản", "bàn giao thô"],
    "đầy đủ nội thất": ["đầy đủ nội thất", "full nội thất", "nội thất đầy đủ"],
}

# Regex patterns used for completeness scoring: one pattern per kind of
# information a listing is expected to mention. NLPService._compute_completeness
# searches each pattern case-insensitively and returns the fraction that match,
# so every entry carries equal weight.
_COMPLETENESS_FIELDS: list[str] = [
    r"\d+(?:[.,]\d+)?\s*(?:m2|m²|mét vuông)",  # area
    r"\d+\s*(?:phòng ngủ|pn|PN)",  # bedrooms
    r"\d+\s*(?:tầng|lầu)",  # floors
    r"(?:sổ đỏ|sổ hồng|chính chủ|pháp lý)",  # legal
    r"\d+(?:[.,]\d+)?\s*(?:tỷ|tỉ|triệu)",  # price
    r"(?:quận|huyện|phường|xã|đường|tp\.|thành phố)",  # location
    r"(?:căn hộ|chung cư|nhà phố|biệt thự|đất|shophouse|nhà riêng)",  # property type
]


class NLPService:
    """Vietnamese NLP pipeline for property description analysis.

    Provides keyword-based auto-tagging, tokenization / sentence splitting /
    noun-phrase extraction (through ``underthesea`` when it is installed) and
    heuristic quality scores for a listing description. The class keeps no
    instance state.
    """

    # Sentence-boundary patterns for the regex-based splitters. A "." counts
    # as a boundary only when it is NOT sitting between two digits, so figures
    # such as "3.5 tỷ", "85.5m2" or "1.000.000" are not cut in half.
    # Readability scoring additionally treats ";" as a boundary.
    _READABILITY_BOUNDARY_RE = re.compile(r"(?:(?<!\d)\.|\.(?!\d)|[!?;\n])+")
    _SENTENCE_BOUNDARY_RE = re.compile(r"(?:(?<!\d)\.|\.(?!\d)|[!?\n])+")

    def _match_tags(
        self,
        text_lower: str,
        tag_dict: dict[str, list[str]],
        category: str,
    ) -> list[PropertyTag]:
        """Return one tag per ``tag_dict`` entry whose keywords occur in the text.

        Args:
            text_lower: Lower-cased description. Keywords are matched as plain
                substrings, so they must be lower case as well.
            tag_dict: Mapping of canonical tag name to its trigger keywords.
            category: Category label stored on every produced tag.

        Returns:
            Tags in ``tag_dict`` iteration order. For each tag, the first
            keyword (in list order) found in the text is reported as
            ``matched_text``; later keywords of that tag are not examined.
        """
        tags: list[PropertyTag] = []
        for tag_name, keywords in tag_dict.items():
            # First keyword present in the text, or None when the tag is absent.
            hit = next((kw for kw in keywords if kw in text_lower), None)
            if hit is None:
                continue
            tags.append(
                PropertyTag(
                    category=category,
                    tag=tag_name,
                    # Only the lower-cased text is available here, so the
                    # matched span is exactly the keyword itself.
                    matched_text=hit,
                    # Keywords of three characters or fewer get the lower score.
                    confidence=0.9 if len(hit) > 3 else 0.75,
                )
            )
        return tags

    def _compute_completeness(self, text: str) -> float:
        """Return the fraction of expected listing fields mentioned in ``text``.

        Every pattern in ``_COMPLETENESS_FIELDS`` is searched case-insensitively;
        the result is ``matched / total`` rounded to three decimals.
        """
        matched = sum(
            1 for pattern in _COMPLETENESS_FIELDS if re.search(pattern, text, re.IGNORECASE)
        )
        return round(matched / len(_COMPLETENESS_FIELDS), 3)

    def _compute_readability(self, text: str) -> float:
        """Score readability from average sentence length and capitalisation.

        Sentences are split on ``. ! ? ;`` and newlines (a dot between two
        digits is not a boundary). Returns 0.0 when no sentence is found,
        otherwise a value rounded to three decimals.
        """
        sentences = [
            part.strip()
            for part in self._READABILITY_BOUNDARY_RE.split(text)
            if part.strip()
        ]
        if not sentences:
            return 0.0

        avg_sentence_len = sum(len(s.split()) for s in sentences) / len(sentences)

        # Fixed scores for very short / very long sentences; in between the
        # score peaks at 14 words per sentence and falls off linearly.
        # NOTE(review): averages of 35-40 words bottom out at 0.3 while anything
        # above 40 gets 0.5 — confirm this step is intended.
        if avg_sentence_len < 3:
            score = 0.4
        elif avg_sentence_len > 40:
            score = 0.5
        else:
            score = 1.0 - abs(avg_sentence_len - 14) / 30
            score = max(0.3, min(1.0, score))

        # Penalize "shouting": more than 30% of ALL characters (spaces and
        # digits included in the denominator) being upper case.
        caps_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        if caps_ratio > 0.3:
            score *= 0.7

        return round(score, 3)

    def _compute_info_density(self, text: str, tag_count: int) -> float:
        """Return tags-per-word density, scaled by 5 and capped at 1.0.

        Args:
            text: Description whose whitespace-separated words are counted.
            tag_count: Number of tags matched in the description.

        Returns:
            0.0 for text without words, else ``min(1, 5 * tags / words)``
            rounded to three decimals.
        """
        word_count = len(text.split())
        if word_count == 0:
            return 0.0
        # One tag per five words already counts as maximum density.
        density = min(1.0, (tag_count * 5) / word_count)
        return round(density, 3)

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize with ``underthesea.word_tokenize``; whitespace split if missing."""
        try:
            # Imported lazily so the service still works without underthesea.
            from underthesea import word_tokenize

            return word_tokenize(text)
        except ImportError:
            logger.warning("underthesea not available — falling back to whitespace split")
            return text.split()

    def _sentence_split(self, text: str) -> list[str]:
        """Split into sentences with ``underthesea.sent_tokenize`` when available.

        The fallback splits on ``. ! ?`` and newlines, keeps decimal numbers
        intact, and drops empty fragments.
        """
        try:
            from underthesea import sent_tokenize

            return sent_tokenize(text)
        except ImportError:
            return [
                part.strip()
                for part in self._SENTENCE_BOUNDARY_RE.split(text)
                if part.strip()
            ]

    def _extract_noun_phrases(self, tokens: list[str], text: str) -> list[str]:
        """Extract key noun phrases using POS tagging when available.

        Consecutive words tagged ``N``, ``Np``, ``Nc``, ``Nu`` or ``A`` are
        joined with spaces; only runs of at least two words are kept.

        Args:
            tokens: Currently unused; tagging is run on ``text`` directly.
            text: Raw description passed to ``underthesea.pos_tag``.

        Returns:
            At most 20 phrases in order of appearance, or an empty list when
            ``underthesea`` cannot be imported.
        """
        try:
            from underthesea import pos_tag

            tagged = pos_tag(text)
            phrases: list[str] = []
            current_phrase: list[str] = []

            for word, pos in tagged:
                if pos in ("N", "Np", "Nc", "Nu", "A"):
                    current_phrase.append(word)
                else:
                    # A non-nominal word closes the current run.
                    if len(current_phrase) >= 2:
                        phrases.append(" ".join(current_phrase))
                    current_phrase = []

            # Flush a run that extends to the end of the text.
            if len(current_phrase) >= 2:
                phrases.append(" ".join(current_phrase))

            return phrases[:20]  # Cap at 20 phrases
        except ImportError:
            return []

    def analyze(self, req: NLPAnalyzeRequest) -> NLPAnalyzeResponse:
        """Run tagging, tokenization and quality scoring on ``req.text``.

        When ``req.include_moderation`` is truthy the moderation service is
        called as well and its score scales the overall quality.

        Returns:
            Response carrying the matched tags, the quality scores, tokens,
            sentences and noun-phrase keywords.
        """
        text = req.text
        # Tag dictionaries are lower case, so match against lower-cased text.
        text_lower = text.lower()

        # Auto-tag
        amenity_tags = self._match_tags(text_lower, AMENITY_TAGS, "amenity")
        location_tags = self._match_tags(text_lower, LOCATION_TAGS, "location")
        condition_tags = self._match_tags(text_lower, CONDITION_TAGS, "condition")
        all_tags = amenity_tags + location_tags + condition_tags

        # Tokenization
        tokens = self._tokenize(text)
        sentences = self._sentence_split(text)
        keyword_phrases = self._extract_noun_phrases(tokens, text)

        # Quality scoring
        completeness = self._compute_completeness(text)
        readability = self._compute_readability(text)
        info_density = self._compute_info_density(text, len(all_tags))

        # Moderation integration (imported lazily, only when requested)
        moderation_score: float | None = None
        if req.include_moderation:
            from app.services.moderation_service import moderation_service
            from app.models.moderation import ModerationRequest

            mod_result = moderation_service.check(ModerationRequest(text=text, context="listing"))
            moderation_score = mod_result.score

        # Overall quality: weighted combination (40% completeness, 30%
        # readability, 30% information density) scaled by the moderation factor.
        # NOTE(review): assumes the moderation score lies in [0, 1] with higher
        # meaning more problematic — confirm against the moderation service.
        mod_penalty = (1 - moderation_score) if moderation_score is not None else 1.0
        overall = round(
            (completeness * 0.4 + readability * 0.3 + info_density * 0.3) * mod_penalty,
            3,
        )

        quality = QualityScore(
            overall=overall,
            completeness=completeness,
            readability=readability,
            information_density=info_density,
            moderation_score=moderation_score,
        )

        return NLPAnalyzeResponse(
            tags=all_tags,
            quality=quality,
            tokens=tokens,
            sentences=sentences,
            keyword_phrases=keyword_phrases,
        )


# Module-level instance created at import time. NLPService keeps no instance
# state, so this one object can be reused for every call.
nlp_service = NLPService()